In [34]:
# Import necessary libraries
import requests
import xml.etree.ElementTree as ET
import simon

# Create a Simon context
context = simon.create_context("test-uid")

# Create a Simon datastore
ds = simon.Datastore(context)

# Define the URL to fetch
sitemap_url = "https://www.jemoka.com/sitemap.xml"

# Send a GET request to fetch the sitemap
response = requests.get(sitemap_url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the XML content
    root = ET.fromstring(response.content)

    # Counter to limit output to 5 lines to make sure we get the response
    count = 0

    # Loop through the URLs 
    for url_element in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
        page_url = url_element.text

        # Store each page 
        doc_hash = ds.store_remote(page_url, "Web Page")

        # Print the result
        print(f"Stored page {page_url} with hash {doc_hash}")

        # Increment the counter
        count += 1

        # Limit output to 5 lines cause there are lots of lines 
        if count >= 5:
            break
else:
    print(f"Failed to fetch sitemap. Status code: {response.status_code}")

Stored page https://www.jemoka.com/posts/kbhistudio_meeting_nodes/ with hash 507a1fe409921baa6547a204e8e61d86f5fcbab723d5f55cee76becaf1040169
Stored page https://www.jemoka.com/posts/kbhmaking_qubits_interact/ with hash 4d99a52ba78231e3dba6d1665d3e871457580c11d56b6b1d39688919f6fbbf51
Stored page https://www.jemoka.com/posts/kbhpoint_estimate/ with hash 507a1fe409921baa6547a204e8e61d86f5fcbab723d5f55cee76becaf1040169
Stored page https://www.jemoka.com/posts/kbhproperties_of_the_stable_matching_algorithm/ with hash 7360c35693ea22fc979f664bb22a248fb74a6d83d8c270185ab9a2dd0fde5c13
Stored page https://www.jemoka.com/posts/kbhrnn_notes/ with hash 4cd8f49c0c4c1e9ec10fe325cda7d97306116a0509e3b5b0bcb1a33d3afaead4


# This is how to use Simon to search for information on the stored web pages.

In [36]:
# Import the Simon Search module
from simon.search import Search

# Create a Simon Search object
search = Search(context)

# Define a query
search_query = "what is the main content in this page?"  # You can replace with your query, this is a example

# Perform a search using Simon
search_result = search.query(search_query)

# Access the answer and search results
answer = search_result.get("answer", "No answer found.")
search_results = search_result.get("search_results", [])

# Print the answer
print("Simon's Answer:")
print(answer)

# Print the search results
print("\nSearch Results:")
for index, result in enumerate(search_results, start=1):
    title = result.get("Title", "No title")
    snippet = result.get("snippet", "No snippet")
    print(f"{index}. Title: {title}")
    print(f"   Snippet: {snippet}\n")

Simon's Answer:
The main content of the page is about writing a scientific article [7]. It provides a structure for the article and discusses the goals of each section [10]. The page goes into detail about each section, including the introduction, motivation, background, methods, results, conclusion, and ethics [21].

Search Results:
1. Title: No title
   Snippet: No snippet

2. Title: No title
   Snippet: No snippet

3. Title: No title
   Snippet: No snippet

4. Title: No title
   Snippet: No snippet

5. Title: No title
   Snippet: No snippet



Brainstorm Example below:

In [None]:
# Perform a brainstorm search on a paragraph about sitemap
paragraph = """
The main content of this page is a comprehensive guide on writing your first article, specifically focusing on the field of Applied Machine Learning (AML) [7]. It provides detailed instructions on how to structure the report, with a section-wise discussion including abstract, introduction, methods, results, conclusion, and ethics [13]. The guide also emphasizes the importance of clear, concise, and precise writing [10], and discusses the ethical considerations related to data collection and processing [31].
"""

questions = search.brainstorm(paragraph)

# Print the salient questions generated by the brainstorm search
for i, question in enumerate(questions, start=1):
    print(f"Question {i}: {question}")

Question 1: {'headline': '"Writing AML Article Guide"', 'relavent_input': '\nThe main content of this page is a comprehensive guide on writing your first article', 'resource': {'quote': 'Web Page AML: Your First ArticleHoujun Liu\nAML: Your First Articletable of contentsexemplar we discussedoverall goalsdiscussion per sectionabstractintro/motivation/backgroundmethodsresults/dataconclusion/discussionethicsfrom the expertsHello y’all!', 'chunk': {'text': 'AML: Your First ArticleHoujun Liu\nAML: Your First Articletable of contentsexemplar we discussedoverall goalsdiscussion per sectionabstractintro/motivation/backgroundmethodsresults/dataconclusion/discussionethicsfrom the expertsHello y’all! This quick post about&mldr; writing your first “article” (ahem, MA) for this class. To me, the most rewarding part of our journey together is to be able to support everyone through writing very presentable reports—even if it is on old or simple problems—but in the format from which you can easily jum

Regular Document Semantic Search below:

In [None]:
query = "Are the nodes system independent of the class system?"
results = search.search(query)

# Print the search results
for i, result in enumerate(results, start=1):
    print(f"Result {i}:")
   

Result 1:
Result 2:
Result 3:
Result 4:
Result 5:
Result 6:
Result 7:
Result 8:
Result 9:
Result 10:
Result 11:
Result 12:
Result 13:
Result 14:


Text-Based Search on Document Titles (Autocomplete) below:

In [None]:
# Perform a text-based search on document titles 
query = "nodes"
results = search.autocomplete(query)

# Print the search results (titles only)
for i, result in enumerate(results, start=1):
    print(f"Result {i}: {result['title']}")