In [49]:
# we now import the package
import simon
from simon.ingestion import OCRIngester
import textwrap
from functions import display_query

In [50]:
# We are only setting up logging here, making Simon extremely verbose while
# muting the warning of most everything else.
# When you are debugging with Simon, this is the recommended verbosity.

import logging as L

LOG_FORMAT = '[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s'
L.basicConfig(format=LOG_FORMAT, level=L.WARNING)
L.getLogger('simon').setLevel(L.DEBUG)

### 1: Simon Setup

In [52]:
# IF YOU HAVE A LOCAL .env set up following the example here 
#   https://github.com/Shabang-Systems/simon/blob/main/.env.example:
context = simon.create_context("test-uid")   
    # the UID here is an arbiturary string, think about it like database tables.
    # Data stored in `AgentContext`s belonging to one UID are not accessible by
    # Simon operations initialized with a context belonging to another UID.


In [None]:
# Otherwise:
context = simon.create_context(uid="test-uid", openai_api_key="sk-your_open_ai_key", # see above for what the UID is
                               db_config={"host": "your db host",
                                          "port": 5432,
                                          "user": "posgres",
                                          "password": "super secure",
                                          "database": "dbname"})


### 2: Datastore

In [None]:
# Simon manages its data with an object named Datastore:

ds = simon.Datastore(context)

In [53]:
# You pass this function an URL (PDF, png, or website are all fine, we can OCR)
# and a "title" identifying the document. 
# Now, we pass a website into the function
doc_hash = ds.store_remote(
    "https://www.math.uci.edu/admission", "UCI Math Admission")

[2023-09-11 21:45:34,989] [simon] [INFO] OCR Ingesting Remote https://www.math.uci.edu/admission...
[2023-09-11 21:45:34,992] [simon] [DEBUG] Hash is found in the cache; we are done.


In [54]:
# String Literals can also be stored into Datastore 
doc_hash2 = ds.store_text("words words words", "title of the words", "source text here")
# Perhaps unsuprisingly, you can forget this document you just created from Simon's
# memory by

ds.delete(doc_hash2)


[2023-09-11 21:47:42,053] [simon] [INFO] Indexing d7abfc8321855beef6696aa4111e7848896d7ae9a7f6dc28c9e598a49d012884...
[2023-09-11 21:47:42,054] [simon] [INFO] Bulk indexing 1 documents...
[2023-09-11 21:47:42,055] [simon] [DEBUG] Identifying already indexed documents...
[2023-09-11 21:47:42,057] [simon] [DEBUG] All of 1 documents are all indexed. Returning...
[2023-09-11 21:47:42,058] [simon] [INFO] Deleting document by the hash of d7abfc8321855beef6696aa4111e7848896d7ae9a7f6dc28c9e598a49d012884!


### 3:Search

In [55]:
# to first create a Search object:
search = simon.Search(context)

In [56]:
# There are four types of search one can do. The most powerful, and the big event
# of Simon is the `query` type search
result = search.query("Is GRE subject test score required?")

[2023-09-11 21:48:32,148] [simon] [INFO] Serving query "Is GRE subject test score required?"...
[2023-09-11 21:48:32,149] [simon] [INFO] Semantic searching for query "('Is GRE subject test score required?',)"...
[2023-09-11 21:48:32,799] [simon] [DEBUG] Final search queries "['Is GRE subject test score required?']"...
[2023-09-11 21:48:32,799] [simon] [DEBUG] fufilling search request for ['Is GRE subject test score required?']...
[2023-09-11 21:48:32,800] [simon] [DEBUG] building queries for ['Is GRE subject test score required?']...
[2023-09-11 21:48:32,800] [simon] [DEBUG] building embeddings for ['Is GRE subject test score required?']...
[2023-09-11 21:48:33,075] [simon] [DEBUG] executing ['Is GRE subject test score required?']...
[2023-09-11 21:48:33,096] [simon] [DEBUG] assembling results for ['Is GRE subject test score required?']...
[2023-09-11 21:48:33,097] [simon] [DEBUG] done with ['Is GRE subject test score required?']...
[2023-09-11 21:48:33,097] [simon] [DEBUG] Results ide

In [57]:
# print the answer of your query

print(textwrap.fill(result['answer'], 50))

The GRE subject test score in Mathematics is
generally required for MS & PhD applicants [14].
However, due to limited test site availability or
other uncontrollable reasons, there is an
exceptional policy that might allow you to submit
your application without it [28]. If your GRE
subject score is in a subject other than math, you
should consult with Professor Solna [9].


In [59]:
# print the corresponding sources for the answer, which are quotes from the original text.
print('Source 14: ', textwrap.fill(result['answer_resources'][14]['quote'], 50), '\n')

print('Source 28: ' ,textwrap.fill(result['answer_resources'][28]['quote'], 50), '\n')

print('Source 9: ' ,textwrap.fill(result['answer_resources'][9]['quote'], 50))

Source 14:  GRE subject exam in Mathematics – Official scores
are required for the Subject GRE exam, taken
within the last five years for MS & PhD
applicants. 

Source 28:  As an exceptional policy, should you not be able
to take this exam for reasons beyond your control,
such as the need for significant travel, we still
encourage you to submit your application to UCI. 

Source 9:  *If you are applying to the Math PhD program, and
you have obtained a Subject GRE score in a subject
other than math, please contact Professor Solna
ksolna@math.uci.edu for further consultation.


### 4: Ingesters

In [60]:
# Simon makes available a suite of ingesters to read all sorts of resources.
# You can use these IN LIEU or IN CONJUNCTION with the datastore ingestion
# example above. They are most helpful with bulk ingestion.
# To replicate the example store operation above using the ingester API:
ingester = OCRIngester(context)
ingester.ingest_remote(
    "https://www.math.uci.edu/sites/default/files/Math%20Graduate%20Student%20Handbook%20UCI_4.pdf", 
    "Graduate Handbook")

[2023-09-11 21:55:19,140] [simon] [INFO] OCR Ingesting Remote https://www.math.uci.edu/sites/default/files/Math%20Graduate%20Student%20Handbook%20UCI_4.pdf...
[2023-09-11 21:55:19,141] [simon] [DEBUG] Hash is found in the cache; we are done.


'a7aa38d16add41e9fab0fbed7d54619bed1ed8ab844f7424cb3e8906073dfbf0'

In [61]:
# Query again after ingestion
result = search.query("What happens if I got B- for Math 210A?")

[2023-09-11 21:55:50,942] [simon] [INFO] Serving query "What happens if I got B- for Math 210A?"...
[2023-09-11 21:55:50,943] [simon] [INFO] Semantic searching for query "('What happens if I got B- for Math 210A?',)"...
[2023-09-11 21:55:51,862] [simon] [DEBUG] Final search queries "['What happens if I got a B- in Math 210A?']"...
[2023-09-11 21:55:51,863] [simon] [DEBUG] fufilling search request for ['What happens if I got a B- in Math 210A?']...
[2023-09-11 21:55:51,863] [simon] [DEBUG] building queries for ['What happens if I got a B- in Math 210A?']...
[2023-09-11 21:55:51,864] [simon] [DEBUG] building embeddings for ['What happens if I got a B- in Math 210A?']...
[2023-09-11 21:55:52,023] [simon] [DEBUG] executing ['What happens if I got a B- in Math 210A?']...
[2023-09-11 21:55:52,036] [simon] [DEBUG] assembling results for ['What happens if I got a B- in Math 210A?']...
[2023-09-11 21:55:52,037] [simon] [DEBUG] done with ['What happens if I got a B- in Math 210A?']...
[2023-09-1

In [62]:
# print the answer
print(textwrap.fill(result['answer'], 50))


If you got a B- for Math 210A, you can request for
an exception as it is used for a course
requirement [3].


In [63]:
# get the source for the answer
print('Source 3: ', textwrap.fill(result['answer_resources'][3]['quote'], 50), '\n')

Source 3:  Generally (1) exception can be requested for a B-
if it is used for a course requirement (MS or
PhD). 



In [64]:
# query again
result = search.query("Do I have to choose an advisor in the end of my first year?")

[2023-09-11 21:56:35,966] [simon] [INFO] Serving query "Do I have to choose an advisor in the end of my first year?"...
[2023-09-11 21:56:35,966] [simon] [INFO] Semantic searching for query "('Do I have to choose an advisor in the end of my first year?',)"...
[2023-09-11 21:56:37,024] [simon] [DEBUG] Final search queries "['Do I need to select an advisor at the end of my first year?']"...
[2023-09-11 21:56:37,024] [simon] [DEBUG] fufilling search request for ['Do I need to select an advisor at the end of my first year?']...
[2023-09-11 21:56:37,024] [simon] [DEBUG] building queries for ['Do I need to select an advisor at the end of my first year?']...
[2023-09-11 21:56:37,025] [simon] [DEBUG] building embeddings for ['Do I need to select an advisor at the end of my first year?']...
[2023-09-11 21:56:37,122] [simon] [DEBUG] executing ['Do I need to select an advisor at the end of my first year?']...
[2023-09-11 21:56:37,138] [simon] [DEBUG] assembling results for ['Do I need to select a

In [65]:
print(textwrap.fill(result['answer'], 50))

No, you don't have to choose an advisor at the end
of your first year. You should choose an advisor
at the latest by the beginning of your 3rd year
[15]. By this time, you should have already
finished your Qualifying exams [16] and have been
exposed to many of the professors and their
research [17].


In [66]:
print('Source 15: ', textwrap.fill(result['answer_resources'][15]['quote'], 50), '\n')

print('Source 17: ' ,textwrap.fill(result['answer_resources'][17]['quote'], 50))

Source 15:  You should choose an advisor at the latest by the
beginning of your 3rd year. 

Source 17:  You should have been exposed to many of the
professors and their research from the graduate
seminar.


#### 5: Display long answer with more sources as HTML

In [67]:
result = search.query("What are the requirements to complete my Ph.D. program?")

[2023-09-11 21:56:56,562] [simon] [INFO] Serving query "What are the requirements to complete my Ph.D. program?"...
[2023-09-11 21:56:56,563] [simon] [INFO] Semantic searching for query "('What are the requirements to complete my Ph.D. program?',)"...
[2023-09-11 21:56:57,094] [simon] [DEBUG] Final search queries "['Ph.D. program requirements']"...
[2023-09-11 21:56:57,095] [simon] [DEBUG] fufilling search request for ['Ph.D. program requirements']...
[2023-09-11 21:56:57,095] [simon] [DEBUG] building queries for ['Ph.D. program requirements']...
[2023-09-11 21:56:57,095] [simon] [DEBUG] building embeddings for ['Ph.D. program requirements']...
[2023-09-11 21:56:57,232] [simon] [DEBUG] executing ['Ph.D. program requirements']...
[2023-09-11 21:56:57,243] [simon] [DEBUG] assembling results for ['Ph.D. program requirements']...
[2023-09-11 21:56:57,244] [simon] [DEBUG] done with ['Ph.D. program requirements']...
[2023-09-11 21:56:57,244] [simon] [DEBUG] Results identified for "('What are

In [45]:
# display the results as html
display_query(result)

Query Result:  /Users/zhijianli/Desktop/test_simon//simon_result.html
