In [24]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import Markdown

import percolate as p8
from percolate.models import AbstractModel
from percolate.utils.ingestion import web
from percolate.utils.parsing import json_loads

In [22]:
# from percolate.api.routes.integrations.services import GmailService

# g = GmailService()
# data = await g.fetch_latest_emails(limit=500, fetch_limit=10000, domain_filter='substack.com')
# import json
# with open("/Users/sirsh/Downloads/substacl_emails.json", 'w') as f:
#     json.dump(data,f)

In [41]:
# Load the cached email records from the same path the commented-out Gmail fetch cell writes to.
# `json` is provided by the notebook's top import cell; this cell previously used it without
# any import, which raises NameError on a fresh kernel.
# NOTE(review): absolute, user-specific path -- consider a configurable data directory.
with open("/Users/sirsh/Downloads/substacl_emails.json", 'r', encoding='utf-8') as f:
    data = json.load(f)

In [67]:
import datetime
from percolate.models import AbstractModel
from percolate.models import DefaultEmbeddingField
from pydantic import BaseModel, Field
import typing
import uuid
from dateparser import parse


# Agent model whose docstring is the instruction text for the indexing agent and whose fields
# describe the JSON structure the agent is asked to return.
# NOTE(review): the docstring is kept byte-identical because it is presumably sent to the
# language model as the prompt -- confirm before editing its wording.
class IndexingAgent(AbstractModel):
    """You are a small indexing agent that extract concepts and keywords from content and provides a brief conceptual summary.
- give a three bullet point concept summary as content. if you are given context use that to contextualize the summary otherise make it general
- provide concept "graph paths" these are topic keyword nodes from least specific to more general i.e. LLMs/AI and not AI/LLMs because LLMs a sub category of AI
- extract a small number of literal keywords from the text all titlecased. Literal keywords must appear exactly but not necessarily in the same casing in the text as words that can be searched
- you can add a general category

Use your model structure to respond strictly in unfenced json and without additional commentary.
You do not need to use any functions.
you do not need to load any functions
"""
    # Fix: the explanatory text used to be passed as Field's first positional argument, which is
    # the *default value*, so every instance without an explicit id got that sentence as its id.
    # It is now the description and the default is None.
    id: typing.Optional[uuid.UUID | str] = Field(None, description="The id is generated as a hash of the required uri and ordinal")
    name: typing.Optional[str] = Field(None, description="A short content name - non unique - for example a friendly label for a chunked pdf document or web page title")
    category: typing.Optional[str] = Field(None, description="A content category")
    content: str = DefaultEmbeddingField(description="The chunk of content from the source")
    ordinal: int = Field(0, description="For chunked content we can keep an ordinal")
    concept_graph_paths: typing.Optional[typing.List[str]] = Field(None, description="Track all paths extracted by an agent as used to build the KG")
    keywords: typing.Optional[typing.List[str]] = Field(None, description="Keywords are extract as-is from text. You should not use aliases here but words that can be searched in the text (ignoring case). You should mix persons, places and strong concepts or rare words here. Please mix these categories")

# Build an agent from the IndexingAgent model with allow_help=False.
# The bare name on the last line displays the agent's repr as the cell output.
agent = p8.Agent(IndexingAgent, allow_help=False)
agent
    
# Record model for one ingested email, enriched with agent-extracted index data.
# NOTE(review): the class docstring is left empty as in the original; model docstrings are
# presumably used by percolate as descriptions/prompts -- confirm before adding one.
class Email(AbstractModel):
    """"""
    id: typing.Optional[uuid.UUID| str]
    snippet: str
    sender: str
    subject: str
    # Fix: the original wrote `= Field` (the function object itself), which silently became the
    # field's default value. The field is now explicitly required.
    date: datetime.datetime = Field(..., description="The email date parsed from the source record")
    content: str = DefaultEmbeddingField(description="The chunk of content from the source")
    concept_graph_paths: typing.Optional[typing.List[str]] = Field(None, description="Track all paths extracted by an agent as used to build the KG")
    keywords: typing.Optional[typing.List[str]] = Field(None, description="Keywords are extract as-is from text. You should not use aliases here but words that can be searched in the text (ignoring case). You should mix persons, places and strong concepts or rare words here. Please mix these categories")
    summary: typing.Optional[str] = Field(None, description="Summary of the content")

    @classmethod
    def from_record(cls, d):
        """Build an Email from a raw record and enrich it with the indexing agent's output.

        Args:
            d: mapping that provides 'from', 'date', 'id' and 'content', plus whatever other
               keys the model requires (snippet, subject). It is not modified.

        Returns:
            An Email instance. If the agent call or the parsing of its response fails, the
            error is printed and the instance is built without the fields that were not set.
        """
        # Work on a shallow copy: the original mutated the caller's dict, so re-running the
        # cell hashed the already-hashed id again and produced different ids each time.
        d = dict(d)

        d['sender'] = d['from']
        d['date'] = parse(str(d['date']))
        d['id'] = p8.utils.make_uuid(d['id'])

        try:
            jdata = agent.run(f"I am interested in classyfing my interests in AI and culture. Please use the data below and create the concept index - ```{d['content']}```")
            jdata = json_loads(jdata)
            d['summary'] = jdata['content']
            d['concept_graph_paths'] = jdata['concept_graph_paths']
            d['keywords'] = jdata['keywords']
        except Exception as ex:
            # Best effort: indexing failures must not prevent the email from being stored.
            print(ex)
        return cls(**d)


#p8.repository(Email).register()
# Build one Email per raw record (each from_record call runs the indexing agent),
# then write the whole batch through the Email repository.
records = [Email.from_record(d) for d in data]
p8.repository(Email).update_records(records)

In [19]:
# implement ranking algorithms
# create the node stats table / matrix for unique nodes and their average weights -> it should be possible to freeze the retrieval so that a tool can also retrieve it. this is a critical design element

# i need a tool for quickly testing and evaluating methods as opposed to a single method

# clustering of what we have and compare with the graph

# transcripts [knowing the % through a set we are]

# a way to 

In [24]:
# Sample article URL; the page is fetched as markdown via percolate's web ingestion helper.
url = "https://www.ai-supremacy.com/p/is-agi-a-hoax-of-silicon-valley?utm_source=post-email-title&publication_id=396235&post_id=159461783&utm_campaign=email-post-title&isFreemail=true&r=55y4t4&triedRedirect=true&utm_medium=email"

markdown = web.fetch_web_markdown(url)

#Markdown(markdown)


In [27]:
# Run the indexing agent over the fetched markdown and parse its reply with json_loads;
# the parsed value is the cell output.
d = agent.run(f"I am interested in teaching AI. Please use the data below and create the concept index - ```{markdown}```")
json_loads(d)

In [19]:
txt = """“I am,” said he, with a firm voice.

“And never allow yourself to be blinded by prejudice?”

“I hope not.”

“It is particularly incumbent on those who never change their opinion, to be secure of judging properly at first.”

“May I ask to what these questions tend?”

“Merely to the illustration of your character,” said she, endeavouring to shake off her gravity. “I am trying to make it out.”

“And what is your success?{120}”

She shook her head. “I do not get on at all. I hear such different accounts of you as puzzle me exceedingly.”

“I can readily believe,” answered he, gravely, “that reports may vary greatly with respect to me; and I could wish, Miss Bennet, that you were not to sketch my character at the present moment, as there is reason to fear that the performance would reflect no credit on either.”

“But if I do not take your likeness now, I may never have another opportunity.”

“I would by no means suspend any pleasure of yours,” he coldly replied. She said no more, and they went down the other dance and parted in silence; on each side dissatisfied, though not to an equal degree; for in Darcy’s breast there was a tolerably powerful feeling towards her, which soon procured her pardon, and directed all his anger against another.

They had not long separated when Miss Bingley came towards her, and, with an expression of civil disdain, thus accosted her,—

“So, Miss Eliza, I hear you are quite delighted with George Wickham? Your sister has been talking to me about him, and asking me a thousand questions; and I find that the young man forgot to tell you, among his other communications, that he was the son of old Wickham, the late Mr. Darcy’s steward. Let me recommend you, however, as a friend, not to give implicit confidence to all his assertions; for, as to Mr. Darcy’s using him ill, it is perfectly false: for, on the contrary, he has been always remarkably kind to him, though George Wickham has treated Mr. Darcy in a most infamous manner. I do not know the particulars, but I know very well that Mr. Darcy is not in the least to{121} blame; that he cannot bear to hear George Wickham mentioned; and that though my brother thought he could not well avoid including him in his invitation to the officers, he was excessively glad to find that he had taken himself out of the way. His coming into the country at all is a most insolent thing, indeed, and I wonder how he could presume to do it. I pity you, Miss Eliza, for this discovery of your favourite’s guilt; but really, considering his descent, one could not expect much better.”

“His guilt and his descent appear, by your account, to be the same,” said Elizabeth, angrily; “for I have heard you accuse him of nothing worse than of being the son of Mr. Darcy’s steward, and of that, I can assure you, he informed me himself.”

“I beg your pardon,” replied Miss Bingley, turning away with a sneer. “Excuse my interference; it was kindly meant.”

“Insolent girl!” said Elizabeth to herself. “You are much mistaken if you expect to influence me by such a paltry attack as this. I see nothing in it but your own wilful ignorance and the malice of Mr. Darcy.” She then sought her eldest sister, who had undertaken to make inquiries on the same subject of Bingley. Jane met her with a smile of such sweet complacency, a glow of such happy expression, as sufficiently marked how well she was satisfied with the occurrences of the evening. Elizabeth instantly read her feelings; and, at that moment, solicitude for Wickham, resentment against his enemies, and everything else, gave way before the hope of Jane’s being in the fairest way for happiness.

“I want to know,” said she, with a countenance no less smiling than her sister’s, “what you have learnt{122} about Mr. Wickham. But perhaps you have been too pleasantly engaged to think of any third person, in which case you may be sure of my pardon.”

“No,” replied Jane, “I have not forgotten him; but I have nothing satisfactory to tell you. Mr. Bingley does not know the whole of his history, and is quite ignorant of the circumstances which have principally offended Mr. Darcy; but he will vouch for the good conduct, the probity and honour, of his friend, and is perfectly convinced that Mr. Wickham has deserved much less attention from Mr. Darcy than he has received; and I am sorry to say that by his account, as well as his sister’s, Mr. Wickham is by no means a respectable young man. I am afraid he has been very imprudent, and has deserved to lose Mr. Darcy’s regard.”

“Mr. Bingley does not know Mr. Wickham himself.”

“No; he never saw him till the other morning at Meryton.”

“This account then is what he has received from Mr. Darcy. I am perfectly satisfied. But what does he say of the living?”

“He does not exactly recollect the circumstances, though he has heard them from Mr. Darcy more than once, but he believes that it was left to him conditionally only.”

“I have not a doubt of Mr. Bingley’s sincerity,” said Elizabeth warmly, “but you must excuse my not being convinced by assurances only. Mr. Bingley’s defence of his friend was a very able one, I dare say; but since he is unacquainted with several parts of the story, and has learnt the rest from that friend himself, I shall venture still to think of both gentlemen as I did before.”

She then changed the discourse to one more gratifying{123} to each, and on which there could be no difference of sentiment. Elizabeth listened with delight to the happy though modest hopes which Jane entertained of Bingley’s regard, and said all in her power to heighten her confidence in it. On their being joined by Mr. Bingley himself, Elizabeth withdrew to Miss Lucas; to whose inquiry after the pleasantness of her last partner she had scarcely replied, before Mr. Collins came up to them, and told her with great exultation, that he had just been so fortunate as to make a most important discovery.

“I have found out,” said he, “by a singular accident, that there is now in the room a near relation to my patroness. I happened to overhear the gentleman himself mentioning to the young lady who does the honours of this house the names of his cousin Miss De Bourgh, and of her mother, Lady Catherine. How wonderfully these sort of things occur! Who would have thought of my meeting with—perhaps—a nephew of Lady Catherine de Bourgh in this assembly! I am most thankful that the discovery is made in time for me to pay my respects to him, which I am now going to do, and trust he will excuse my not having done it before. My total ignorance of the connection must plead my apology.”

“You are not going to introduce yourself to Mr. Darcy?”

“Indeed I am. I shall entreat his pardon for not having done it earlier. I believe him to be Lady Catherine’s nephew. It will be in my power to assure him that her Ladyship was quite well yesterday se’nnight.”

Elizabeth tried hard to dissuade him from such a scheme; assuring him that Mr. Darcy would consider his addressing him without introduction as an impertinent{124} freedom, rather than a compliment to his aunt; that it was not in the least necessary there should be any notice on either side, and that if it were, it must belong to Mr. Darcy, the superior in consequence, to begin the acquaintance. Mr. Collins listened to her with the determined air of following his own inclination, and when she ceased speaking, replied thus,—

“My dear Miss Elizabeth, I have the highest opinion in the world of your excellent judgment in all matters within the scope of your understanding, but permit me to say that there must be a wide difference between the established forms of ceremony amongst the laity and those which regulate the clergy; for, give me leave to observe that I consider the clerical office as equal in point of dignity with the highest rank in the kingdom—provided that a proper humility of behaviour is at the same time maintained. You must, therefore, allow me to follow the dictates of my conscience on this occasion, which lead me to perform what I look on as a point of duty. Pardon me for neglecting to profit by your advice, which on every other subject shall be my constant guide, though in the case before us I consider myself more fitted by education and habitual study to decide on what is right than a young lady like yourself;” and with a low bow he left her to attack Mr. Darcy, whose reception of his advances she eagerly watched, and whose astonishment at being so addressed was very evident. Her cousin prefaced his speech with a solemn bow, and though she could not hear a word of it, she felt as if hearing it all, and saw in the motion of his lips the words “apology,” “Hunsford,” and “Lady Catherine de Bourgh.” It vexed her to see him expose himself to such a man. Mr. Darcy was eyeing him with un{125}restrained wonder; and when at last Mr. Collins allowed him to speak, replied with an air of distant civility. Mr. Collins, however, was not discouraged from speaking again, and Mr. Darcy’s contempt seemed abundantly increasing with the length of his second speech; and at the end of it he only made him a slight bow, and moved another way: Mr. Collins then returned to Elizabeth.

“I have no reason, I assure you,” said he, “to be dissatisfied with my reception. Mr. Darcy seemed much pleased with the attention. He answered me with the utmost civility, and even paid me the compliment of saying, that he was so well convinced of Lady Catherine’s discernment as to be certain she could never bestow a favour unworthily. It was really a very handsome thought. Upon the whole, I am much pleased with him.”

As Elizabeth had no longer any interest of her own to pursue, she turned her attention almost entirely on her sister and Mr. Bingley; and the train of agreeable reflections which her observations gave birth to made her perhaps almost as happy as Jane. She saw her in idea settled in that very house, in all the felicity which a marriage of true affection could bestow; and she felt capable, under such circumstances, of endeavouring even to like Bingley’s two sisters. Her mother’s thoughts she plainly saw were bent the same way, and she determined not to venture near her, lest she might hear too much. When they sat down to supper, therefore, she considered it a most unlucky perverseness which placed them within one of each other; and deeply was she vexed to find that her mother was talking to that one person (Lady Lucas) freely, openly, and of nothing else but of her expectation that Jane would be soon married to Mr. Bingley. It was an animating subject, and Mrs. Bennet{126} seemed incapable of fatigue while enumerating the advantages of the match. His being such a charming young man, and so rich, and living but three miles from them, were the first points of self-gratulation; and then it was such a comfort to think how fond the two sisters were of Jane, and to be certain that they must desire the connection as much as she could do. It was, moreover, such a promising thing for her younger daughters, as Jane’s marrying so greatly must throw them in the way of other rich men; and, lastly, it was so pleasant at her time of life to be able to consign her single daughters to the care of their sister, that she might not be obliged to go into company more than she liked. It was necessary to make this circumstance a matter of pleasure, because on such occasions it is the etiquette; but no one was less likely than Mrs. Bennet to find comfort in staying at home at any period of her life. 
She concluded with many good wishes that Lady Lucas might soon be equally fortunate, though evidently and triumphantly believing there was no chance of it.

In vain did Elizabeth endeavour to check the rapidity of her mother’s words, or persuade her to describe her felicity in a less audible whisper; for to her inexpressible vexation she could perceive that the chief of it was overheard by Mr. Darcy, who sat opposite to them. Her mother only scolded her for being nonsensical.

“What is Mr. Darcy to me, pray, that I should be afraid of him? I am sure we owe him no such particular civility as to be obliged to say nothing he may not like to hear.”"""

In [22]:
# Same indexing call as for the web article, this time over the literary excerpt in `txt`;
# the parsed reply is the cell output.
d = agent.run(f"I am interested in teaching social beahvior. Please use the data below and create the concept index - ```{txt}```")
json_loads(d)

# TODO
- Flaw: the agent tries to ask for help it doesn't need, and it also tries to activate itself. Fix this.


In [293]:
import re
import requests
from bs4 import BeautifulSoup
from pydantic import BaseModel
from typing import List
from pydantic import model_validator

# Indexing agent used by the Chapter validator defined below in this cell
# (same construction as the earlier IndexingAgent cell).
agent = p8.Agent(IndexingAgent, allow_help=False)

# Pydantic model for one chunk of book content. Constructing an instance also runs the
# indexing agent (see the validator), so every construction costs one agent call.
class Chapter(BaseModel):
    id: typing.Optional[uuid.UUID| str]
    content: str = DefaultEmbeddingField(description="The chunk of content from the source")
    url: str
    ordinal: int
    name: str
    concept_graph_paths: typing.Optional[typing.List[str]] = Field(None, description="Track all paths extracted by an agent as used to build the KG")
    keywords: typing.Optional[typing.List[str]] = Field(None, description="Keywords are extract as-is from text. You should not use aliases here but words that can be searched in the text (ignoring case). You should mix persons, places and strong concepts or rare words here. Please mix these categories")
    # Fix: the validator below stores the agent's summary under 'summary', but the model had no
    # such field, so the value was discarded as an unknown key. Declared to mirror Email.summary.
    summary: typing.Optional[str] = Field(None, description="Summary of the content")

    @model_validator(mode='before')
    @classmethod
    def _add_concept_index(cls, values):
        """Enrich the raw input with the indexing agent's output before validation.

        Sends values['content'] to the module-level `agent`, parses the reply with json_loads
        and copies its 'content', 'concept_graph_paths' and 'keywords' entries into `values`
        as 'summary', 'concept_graph_paths' and 'keywords'.

        Any failure (agent error, unparsable reply, missing key) is printed and the values
        gathered so far are returned, so model construction itself is never blocked here.
        """
        try:
            jdata = agent.run(f"I am interested in classyfing my interests in AI and culture. Please use the data below and create the concept index - ```{values['content']}```")
            jdata = json_loads(jdata)
            values['summary'] = jdata['content']
            values['concept_graph_paths'] = jdata['concept_graph_paths']
            values['keywords'] = jdata['keywords']
        except Exception as ex:
            # Best effort: an indexing failure should not stop the record from being created.
            print(ex)
        return values
    
# Book-section model that is also passed to p8.Agent and p8.repository further down.
# NOTE(review): both docstrings below are written as instructions to a language model and are
# presumably consumed at runtime as the agent prompt / tool description -- treat them as prompt
# text and confirm before rewording them.
class Section(Chapter):
    """You are an agent that can look at chunked sections of a book. 
       Books are numbered sequentially BOOK_CHAPTER_SECTION.
       Your data will contain `related_paths` which you should use for further analysis by looking only at the target node in the tuple.
       
       Check TARGET entities mentioned in the related paths `path_node_labels` and figure out how to get more details from those labelled entiries.
       Select a fraction of the most relevant themes to the question and expand on why these themes are relevant to the question.
       
       Always add entity references to the sources you used so we can check them. For example if you look up entity A or read from entity B you should add them in brackets as references.
       
       Provide a DEPTH score for how complete your answer is. Poor answers do not explore related entities and only use the entities returned in the first pass.
       For example, if you ask a question and get back entity content, the related paths show other entities that you could consult.
       Good answers must check the target nodes in the `path_node_labels` tuplues. You should reduce your score if you use the entity lookup function to retrieve entities that you already have.
       
       Give yourself an EFFICIENCY score based on how selective you were in looking up related entities. You dont need to load every entity and you should pick the top 50% based on how closely the topics relate to the users question.
    
       Respond with your final anwser without asking the user for intervention.
    """
    # Position of this chunk within its chapter (the inherited `ordinal` is set to the chapter
    # number by construct_section).
    section_ordinal: int
        
        
    # Runs the p8.deep_search2 SQL function through this class's repository, passing the
    # question and the literal table name 'public.Section'.
    # NOTE(review): the table name is hard-coded rather than derived from `cls`, so a subclass
    # would still search public.Section.
    @classmethod
    def deep_search(cls, question: str):
        """
        Deep search first does a semantic search and returns a number of named entities. It also provide related paths to other entities related by various themes.
        You shoudl use the data to find related entities and lookup only these most interesting related entities. 
        The paths take the form of tuples from the source node that you already have to the target node along various themes in `path_node_labels`
         
        Args:
            question: a detailed question to semantically search content
        """
        
        return p8.repository(cls).execute("""Select * from p8.deep_search2(%s,%s)""", data=(question, 'public.Section'))
        

def fetch_html(url: str, timeout: float = 30.0) -> str:
    """Download a page and return its body as text.

    Args:
        url: address to fetch with an HTTP GET.
        timeout: seconds to wait for the server before giving up. requests waits
            indefinitely when no timeout is given, which would hang the notebook cell.

    Returns:
        The decoded response text.

    Raises:
        requests.HTTPError: if the response status indicates an error.
        requests.Timeout: if the server does not respond within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

L = 2500  # default maximum number of characters per section


def _split_fixed_width(text: str, width: int):
    """Yield consecutive slices of `text`, each at most `width` characters long."""
    for start in range(0, len(text), width):
        yield text[start:start + width]


def parse_sections(html: str, base_url: str, max_chars: typing.Optional[int] = None):
    """Split a book's HTML into chapters and fixed-width sections.

    A chapter starts at any h1/h2/h3 whose text matches ``CHAPTER <number>`` (case-insensitive).
    All text that follows, up to the next chapter heading, is joined with newlines and cut
    into slices of at most `max_chars` characters. Text before the first chapter heading is
    ignored. The chapter number is printed as each chapter heading is encountered.

    Fixes relative to the previous version:
      * Only text nodes are collected. Previously a tag with a single text child contributed
        its text once via the tag's `.string` and again via the child node, duplicating content.
      * The chapter heading text is no longer appended a second time via its own text node.
      * The chunking logic existed twice (mid-loop and after the loop); it now lives in one helper.
      * Input without a <body> element is parsed from the document root instead of raising.

    Args:
        html: the HTML document.
        base_url: passed through unchanged as the last element of every yielded tuple.
        max_chars: maximum section length; defaults to the module-level `L` at call time.

    Yields:
        Tuples ``(chapter_number, section_ordinal, section_text, base_url)`` where
        section_ordinal starts at 1 within each chapter.

    Raises:
        ValueError: if the effective section length is not positive.
    """
    limit = L if max_chars is None else max_chars
    if limit <= 0:
        raise ValueError("max_chars must be a positive integer")

    soup = BeautifulSoup(html, "html.parser")
    # html.parser does not invent a <body>; fall back to the whole document for fragments.
    root = soup.body if soup.body is not None else soup

    chapter_heading_pattern = re.compile(r'^CHAPTER\s+(\d+)', re.IGNORECASE)

    def emit(chapter, parts):
        # Join the collected text and number the resulting slices from 1.
        full_content = "\n".join(parts).strip()
        for ordinal, chunk in enumerate(_split_fixed_width(full_content, limit), start=1):
            yield chapter, ordinal, chunk, base_url

    current_chapter = None
    chapter_heading = None  # the heading element that opened the current chapter
    chapter_content = []

    for elem in root.descendants:
        if elem.name and elem.name.lower() in ('h1', 'h2', 'h3'):
            text = elem.get_text(strip=True)
            match = chapter_heading_pattern.match(text)
            if match:
                # A new chapter begins: flush the one collected so far.
                if current_chapter is not None:
                    yield from emit(current_chapter, chapter_content)

                current_chapter = int(match.group(1))
                chapter_heading = elem
                print(current_chapter)
                chapter_content = [text]  # the heading is the first line of the chapter
        elif elem.name is None and current_chapter is not None:
            # Text node (its .name is None). Skip the heading's own text: it is already stored.
            if any(parent is chapter_heading for parent in elem.parents):
                continue
            piece = elem.strip()
            if piece:
                chapter_content.append(piece)

    # Flush the final chapter.
    if current_chapter is not None and chapter_content:
        yield from emit(current_chapter, chapter_content)


def construct_section(chapter_ordinal, section_ordinal, section_content, base_url):
    """Create the Section object for one chunk of a chapter.

    The id is the string form of p8.utils.make_uuid applied to the url plus both ordinals,
    the name follows ``book_<chapter>_<section>``, and the content is stripped of
    surrounding whitespace.
    """
    identity = {'url': base_url, 'chapter_ordinal': chapter_ordinal, 'section_ordinal': section_ordinal}
    fields = dict(
        id=str(p8.utils.make_uuid(identity)),
        url=base_url,
        name=f"book_{chapter_ordinal}_{section_ordinal}",
        ordinal=chapter_ordinal,
        chapter_ordinal=chapter_ordinal,
        section_ordinal=section_ordinal,
        content=section_content.strip(),
    )
    return Section(**fields)

# Module-level holder for the most recently parsed sections (read by the repository cell).
sections = []


def parse_chapters(html: str, base_url: str):
    """Parse a book's HTML into Section objects.

    Each tuple produced by parse_sections is turned into a Section via construct_section.

    Fix: the previous version appended to the module-level `sections` list on every call,
    so calling it twice left every section in the list twice. The list's contents are now
    replaced with the result of this call. The same list object is still updated and
    returned, so code reading the global `sections` keeps working.

    Args:
        html: the HTML document.
        base_url: source URL recorded on every section.

    Returns:
        The module-level `sections` list, holding the sections parsed by this call.
    """
    parsed = [
        construct_section(chapter_ordinal, section_ordinal, section_content, section_url)
        for chapter_ordinal, section_ordinal, section_content, section_url
        in parse_sections(html, base_url)
    ]
    # Slice assignment keeps the identity of the shared list while dropping stale entries.
    sections[:] = parsed
    return sections

# url = "https://www.gutenberg.org/cache/epub/2701/pg2701-images.html"
# html = fetch_html(url)
# sections = parse_chapters(html, base_url=url)

# Agent built from the Section model; the bare name on the last line displays its repr.
sea=p8.Agent(Section)
sea

In [296]:
# Ask the Section agent a multi-part question, selecting the language model by name and
# passing limit=10, then render the reply as Markdown.
a = sea("(using deep search) The carpenter was given orders and so was the blacksmith on the same task. What orders where they given and what happened re: the carpenter in the immidately following chapter. Can you eloborate in great detail on some of the themes given based on deeper lookups via the `related paths`",
       language_model='claude-3-7-sonnet-20250219',
        limit=10
       )
#a = sea("If you found inforamtion in book_121_7 being the last section of that chapter, what do you think would be the entity for the following chapter")
Markdown(a)

In [291]:
from percolate.services import PostgresService
from percolate.models.p8 import LanguageModelApi
from percolate.utils import make_uuid
# Upsert a LanguageModelApi record for the Claude model. The id is derived from the model name
# via make_uuid, and the API token is referenced by environment variable name, not stored here.
PostgresService().update_records(LanguageModelApi(id = make_uuid('claude-3-7-sonnet-20250219'), name = 'claude-3-7-sonnet-20250219', scheme='anthropic', completions_uri='https://api.anthropic.com/v1/messages', token_env_key='ANTHROPIC_API_KEY'))

In [255]:
# Decode the JSON content of the agent's second message (index 1) for inspection.
json.loads(sea.messages.data[1]['content'])

In [170]:

# Ask the Section agent an exploratory question and render the reply as Markdown.
a = sea("Please explore harpoons in relation to sperm whales and grooming and explain the reasoning you used to get as much detail as possibe. ")
Markdown(a)

In [182]:
# Decode the JSON content of the agent's second message (index 1) for inspection.
json.loads(sea.messages.data[1]['content'])

In [140]:
# Register the Section model with its repository, then write the parsed sections to it.
# NOTE(review): `sections` must have been populated by parse_chapters beforehand; that call is
# commented out in the cell that defines it.
repo = p8.repository(Section)
repo.register()
repo.update_records(sections)

In [142]:
import umap
from percolate.services import PostgresService

# Load every Section embedding joined with its section's name, paths and keywords.
# `np` and `pd` come from the notebook's top import cell; this cell previously used them
# before any cell had imported them, which raises NameError on Restart & Run All.
pg = PostgresService()
rows = pg.execute("""SElect a.*, b.name, b.concept_graph_paths, b.keywords, b.name from p8_embeddings."public_Section_embeddings" a join public."Section" b on b.id = a.source_record_id """)

# NOTE(review): `id` comes from `a.*`, so this is presumably the embedding row's id rather
# than the Section id -- confirm which one downstream cells expect.
ids = [row['id'] for row in rows]

# Each vector is indexed as text: [1:-1] drops its first and last character (presumably the
# enclosing brackets) before the comma-separated floats are parsed.
embeddings = np.array([
    np.fromstring(row['embedding_vector'][1:-1], sep=',', dtype=np.float32)
    for row in rows
])

# Kept under the name `data` (as before) so later cells see the same variables.
data = pd.DataFrame(rows)
keywords = data['keywords']
names = data['name']
paths = data['concept_graph_paths']

In [143]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Fit a 3-nearest-neighbour index (euclidean metric) on the embeddings and query it with the
# same points.
knn = NearestNeighbors(n_neighbors=3, algorithm='auto', metric='euclidean')
knn.fit(embeddings)
distances, indices = knn.kneighbors(embeddings)
import pandas as pd
# Neighbour indices and their distances side by side; the distance columns take the 'd'
# suffix on the join. ids, names and keywords are attached positionally for reading.
a = pd.DataFrame(indices).join(pd.DataFrame(distances),rsuffix='d')
a['ids'] = ids
a['name'] = names
a['keywords'] = keywords
a

In [144]:
# Two-component UMAP with a fixed random_state so the projection is repeatable.
umap_model = umap.UMAP(n_components=2, random_state=42)

# Fit and transform the embeddings
embedding_2d = umap_model.fit_transform(embeddings)


In [145]:
from matplotlib import pyplot as plt
# Scatter the 2D UMAP projection and annotate every point with its concept graph paths.
fig, ax = plt.subplots(figsize=(40, 40))

# Fix: `cmap='Spectral'` was passed without any `c` values, so matplotlib ignored the colormap
# and emitted a warning. The argument is dropped; the points are drawn exactly as before.
ax.scatter(embedding_2d[:, 0], embedding_2d[:, 1], s=5)
for x, y, label in zip(embedding_2d[:, 0], embedding_2d[:, 1], paths):
    ax.annotate(label, (x, y), textcoords="offset points", xytext=(0, 10), ha='center', fontsize=8)

ax.set_title('2D UMAP Projection of Embeddings')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
plt.show()
