# Demonstrate provenance system

In [1]:
from pydantic import BaseModel

from text2graph.xdd import USGSRetriever
from text2graph.llm import ask_llm
from text2graph.prompt import PromptHandlerV3
from text2graph.alignment import AlignmentHandler
from text2graph.schema import Provenance
from text2graph.gkm.gkm import triplet_to_rdf, graph_to_ttl_string

import logging

logging.basicConfig(level=logging.INFO)

# run RAG Hybrid endpoint with given query

In [2]:
retriever = USGSRetriever()
paragraph = retriever.query_ask_xdd("Waldron Shale geological formation Silurian")
paragraph

Paragraph(paper_id='5da51b57998e17af8253d343', preprocessor_id='haystack_v0.0.2', doc_type='paragraph', topic_list=['criticalmaas', 'dolomites'], text_content='The Silurian system in the Sequatchie Valley is composed of a thinto medium-bedded alternation of shale~ limestone, siltstone, sandstonej and beds of red iron ore., In the southern part of the valley the succession is more sandy and ferruginouso These beds of Silurian age lie on limestone or shale of Ordovician age, and are overlain disconformably by the Upper\nDevonian Chattanooga shaleo The Ordovician and Silurian Rockwood group of Tennessee is not everywhere easily divisible into its two componentso The chief lithologic difference is the relative proportion of red shale and limestone: the Ordovician Sequatchie formation is characterized by more of the red shale, and the Silurian beds by more of the limestoneo In Alabama the Silurian Red Mountain formation lies on limestone of Ordovician age o The Silurian succession ranges in

# convert paragaph to graph request/ API input


In [3]:
class GraphRequest(BaseModel):
    text: str
    model: str = "mixtral"
    doc_ids: list[str] | None = None
    provenance: Provenance | None = None
    

gr = GraphRequest(
    text=paragraph.text_content,
    model="gpt-4-turbo-preview",
    doc_ids = [paragraph.paper_id],
    provenance=paragraph.provenance
)

gr

GraphRequest(text='The Silurian system in the Sequatchie Valley is composed of a thinto medium-bedded alternation of shale~ limestone, siltstone, sandstonej and beds of red iron ore., In the southern part of the valley the succession is more sandy and ferruginouso These beds of Silurian age lie on limestone or shale of Ordovician age, and are overlain disconformably by the Upper\nDevonian Chattanooga shaleo The Ordovician and Silurian Rockwood group of Tennessee is not everywhere easily divisible into its two componentso The chief lithologic difference is the relative proportion of red shale and limestone: the Ordovician Sequatchie formation is characterized by more of the red shale, and the Silurian beds by more of the limestoneo In Alabama the Silurian Red Mountain formation lies on limestone of Ordovician age o The Silurian succession ranges in thickness from a few feet near the north end of the valley to about 200 feet south of Guntersville, Alao', model='gpt-4-turbo-preview', doc_

# Simulate API call

In [4]:
triplets_result = await ask_llm(
    text=gr.text,
    prompt_handler=PromptHandlerV3(),
    model=gr.model,
    temperature=0,
    to_triplets=True,
    alignment_handler=AlignmentHandler.load(
            "data/known_entity_embeddings/all-MiniLM-L6-v2"
        ),
    doc_ids=gr.doc_ids,
    provenance=gr.provenance,
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:httpx:HTTP Request: GET https://macrostrat.org/api/defs/strat_names?strat_name=Chattanooga "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://macrostrat.org/api/defs/strat_names?strat_name=Sequatchie "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://macrostrat.org/api/defs/strat_names?strat_name=Silurian%20system "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://macrostrat.org/api/defs/strat_names?strat_name=Red%20Mountain "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://macrostrat.org/api/defs/strat_names?strat_name=Rockwood%20group "HTTP/1.1 200 OK"
INFO:root:No records found for stratigraphy 'Silurian system'
INFO:root:No records found for stratigraphy 'Rockwood group'
INFO:httpx:HTTP Request: GET https://serpapi.com/search.json?engine=google_maps&q=Sequatchie%20Valley&api_key=7e3bd7b2ab5828e1e23fa6e5bf36775d6de81d9f8b7667b5ce28d2cd4f535e39 "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://serpapi.com/search.json?engine=google_maps&q=Tennessee&api_key=7

In [5]:
# View results
# triplets_result.model_dump()

# Persist to disk

In [28]:
from text2graph.gkm.gkm import triplet_to_rdf, graph_to_ttl_string

import logging

logging.basicConfig(level=logging.INFO)


from text2graph.schema import GraphOutput
from pathlib import Path
import json
from uuid import UUID 
from datetime import datetime
#GraphOutput(**triplets_result.model_dump())

class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, UUID):
            # if the obj is uuid, we simply return the value of uuid
            return obj.hex
        if isinstance(obj, datetime):
            return obj.isoformat()
        return json.JSONEncoder.default(self, obj)


testing_triplets_path = Path("data/testing_triplets.json")
if not testing_triplets_path.exists():
    with open(testing_triplets_path, 'w') as f:
        json.dump(triplets_result.model_dump(), f, cls=MyEncoder)

with open(testing_triplets_path, 'r') as f:
    testing_triplets = GraphOutput(**json.load(f))


# show example hydrated Stratigraphy

In [30]:
example_triplet = testing_triplets.triplets[1]
example_triplet.object

Stratigraphy(strat_name='Chattanooga', strat_name_long='Chattanooga Formation', rank='Fm', strat_name_id=7412, concept_id=1000, bed='', bed_id=0, mbr='', mbr_id=0, fm='Chattanooga', fm_id=7412, subgp='', subgp_id=0, gp='', gp_id=0, sgp='', sgp_id=0, b_age=393.3, t_age=351.5, b_period='Devonian', t_period='Carboniferous', c_interval='', t_units=1, ref_id=19, provenance=Provenance(id=UUID('87599786-af32-4d47-8e48-2fbe4194e774'), source_name='Macrostrat', source_url='https://macrostrat.org/api/defs/strat_names?strat_name_id=7412', source_version=2, requested=datetime.datetime(2024, 4, 12, 16, 1, 12, 175105), additional_values=None, previous=Provenance(id=UUID('c2010a5c-10fe-4e63-ad51-50df68ea964d'), source_name='OpenAIModel', source_url=None, source_version='gpt-4-turbo-preview', requested=datetime.datetime(2024, 4, 12, 16, 1, 12, 175105), additional_values={'temperature': 0, 'prompt': 'v3', 'doc_ids': ['5da51b57998e17af8253d343']}, previous=Provenance(id=UUID('15096d05-51bc-4718-99ce-3a5

# Create TTL formatted triple output 

In [31]:
output_ttl_path = Path("data/test_triplet.ttl")
print(
    graph_to_ttl_string(
        g=triplet_to_rdf(triplet=example_triplet),
        filename=output_ttl_path
    )
)

@prefix gsgu: <https://w3id.org/gso/geologicunit/> .
@prefix gsoc: <https://w3id.org/gso/1.0/common/> .
@prefix gsog: <https://w3id.org/gso/geology/> .
@prefix gspr: <https://w3id.org/gso/geologicprocess/> .
@prefix gst: <https://w3id.org/gso/geologictime/> .
@prefix msl: <https://macrostrat.org/lexicon/> .
@prefix pav: <http://purl.org/pav/> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xdd: <https://xdd.wisc.edu/lexicon/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

msl:ChattanoogaFormation a gsgu:Formation ;
    rdfs:label "Chattanooga Formation"@en ;
    prov:wasGeneratedBy xdd:OpenAIModel_query ;
    gsoc:hasQuality [ a gsoc:SpatialLocation ;
            gsoc:hasValue [ a gsoc:WKT_Value ;
                    prov:wasGeneratedBy xdd:SERPAPIQuery ;
                    gsoc:hasDataValue "( POINT -85.6180227 35.0195229 )" ;
                    gsoc:hasReferenceSystem <https://epsg.io/4326> ],
                [ a