In [20]:
# Few-shot prompt template for the triplet validator.
# - The text is laid out as system / user / assistant turns delimited by
#   <|im_start|> ... <|im_end|> markers, ending with an open assistant turn.
# - It is rendered with str.format(input=...): `{input}` is the only placeholder,
#   and every literal JSON brace is doubled (`{{` / `}}`) so format() leaves it alone.
# - The instruction paragraph states that time and location relationships are
#   to be excluded, which is what the two "invalid" few-shot answers demonstrate.
PROMPT = """
<|im_start|>system
As a geological expert, your task is to scrutinize the provided dataset for predefined geological relationships. Only the relationships listed below are acceptable:

    "has lithology of" 
    "stratigraphic unit has lithology"
    "has sedimentary structure"
    "lithology has type of"
    "has grains of"
    "has color of"
    "has bedform of"
    "has structure of"
    
Disregard any relationships not explicitly listed, and avoid attributing new meanings to the specified relationships. Apply a rigorous standard in your assessment. All ambiguous or divergent relationships must be designated as invalid. For example, relationships regarding time and location should be excluded. 

Please format your output as follows:

{{
  "reasoning": "Detail your reasoning behind whether or not the relationship is valid",
  "is_valid": "Indicate if the relationship is in the specified list",
  "head": "Specify the subject of the relationship",
  "tail": "Specify the object of the relationship",
  "relationship": "Cite the specific valid relationship from the list."
}}

<|im_end|>
<|im_start|>user
{{
  "head": "massive, miliolid, pelleted, shell-fragment wackestone and mudstone, grainstones",
  "tail": "West Nueces Formation",
  "relationship": "constitutes"
}}
<|im_end|>
<|im_start|>assistant
{{
  "reasoning": "The relationship indicates that the wackestone, mudstone, and grainstones are the lithology of the West Nueces Formation. Thus, the relationship is considered valid.",
  "is_valid": true,
  "head": "West Nueces Formation",
  "tail": "massive, miliolid, pelleted, shell-fragment wackestone and mudstone, grainstones",
  "relationship": "has lithology of"
}}
<|im_end|>
<|im_start|>user
{{
  "head": "Glen Rose Formation", 
  "tail": "Montana", 
  "relationship": "is located in"
}}
<|im_end|>
<|im_start|>assistant
{{
  "reasoning": "The relationship 'is located in' describes a spatial or geographical association rather than a geological relationship pertaining to lithology, sedimentary structures, stratigraphic units, or other specified geological characteristics. Because this relationship is not found in the predefined list, the relationship is invalid.",
  "is_valid": false
}}
<|im_end|>
<|im_start|>user
{{
  "head": "Cross Member", 
  "tail": "Comfort and Spring Garden Members of Castle Hayne Formation", 
  "relationship": "cross correlates with"
}}
<|im_end|>
<|im_start|>assistant
{{
  "reasoning": "The relationship 'cross correlates with' does not directly align with any of the specified valid relationships. The term 'cross correlates with' refers to a stratigraphic correlation between different members or parts of a formation, which is more related to the spatial and temporal relationship between geological units rather than their physical or compositional characteristics. Thus, the relationship is invalid",
  "is_valid": false
}}
<|im_end|>
<|im_start|>user
{input}
<|im_end|>
<|im_start|>assistant
"""

# Grammar sent as the "grammar" field of every completion request to constrain
# the model's output (syntax looks like llama.cpp GBNF — TODO confirm against
# the server being used).
# It admits exactly one object with these keys, in this order:
#   reasoning (string), is_valid (true | false), head (string), tail (string),
#   relationship (one of the eight quoted phrases listed in the rule below).
# The literal is a raw string so the backslash escapes reach the grammar intact.
# NOTE(review): all five keys are mandatory even when is_valid is false, whereas
# the few-shot "invalid" answers in PROMPT contain only reasoning and is_valid.
# An invalid verdict therefore still has to name a relationship from the list —
# confirm this is intended.
GRAMMAR = r"""
root   ::= object

object ::= 
  "{ \"reasoning\": " string ", \"is_valid\": " ("true" | "false") ", \"head\": " string ", \"tail\": " string ", \"relationship\": " relationship " }"

relationship ::= "\"" ( "has lithology of" 
                       | "has sedimentary structure" 
                       | "stratigraphic unit has lithology" 
                       | "lithology has type of" 
                       | "has grains of" 
                       | "has color of" 
                       | "has bedform of" 
                       | "has structure of" ) "\""

string ::=
  "\"" (
    [^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
  )* "\""
"""

In [21]:
"""
root ::= validObject | invalidObject

validObject ::= "{" ws "reasoning" ":" string "," ws "is valid" ":" "true" "," ws "head" ":" string "," ws "tail" ":" string "," ws "relationship" ":" relationship ws "}"

invalidObject ::= "{" ws "reasoning" ":" string "," ws "is valid" ":" "false" ws "}"

relationship ::= "\"" ( "has lithology of" 
                       | "has sedimentary structure" 
                       | "stratigraphic unit (formation) has lithology" 
                       | "lithology has type of" 
                       | "has grains of" 
                       | "has color of" 
                       | "has bedform of" 
                       | "has structure of" ) "\""

string ::=  "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" space

ws ::= " "?

space ::= " "?
string ::=  "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" space
boolean ::= ("true" | "false") space
root ::= "{" space "\"reasoning\"" space ":" space string "," space "\"is valid\"" space ":" space boolean "}" space
"""

'\nroot ::= validObject | invalidObject\n\nvalidObject ::= "{" ws "reasoning" ":" string "," ws "is valid" ":" "true" "," ws "head" ":" string "," ws "tail" ":" string "," ws "relationship" ":" relationship ws "}"\n\ninvalidObject ::= "{" ws "reasoning" ":" string "," ws "is valid" ":" "false" ws "}"\n\nrelationship ::= """ ( "has lithology of" \n                       | "has sedimentary structure" \n                       | "stratigraphic unit (formation) has lithology" \n                       | "lithology has type of" \n                       | "has grains of" \n                       | "has color of" \n                       | "has bedform of" \n                       | "has structure of" ) """\n\nstring ::=  """ (\n        [^"\\] |\n        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n      )* """ space\n\nws ::= " "?\n\nspace ::= " "?\nstring ::=  """ (\n        [^"\\] |\n        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])

In [22]:
import pandas as pd
import requests
import json
import ast
from IPython.display import display, HTML 

In [23]:
# Load the previously extracted triplets. Later cells read the columns
# formation_name, paper_id and triplet; triplet holds a Python-literal string
# that is decoded with ast.literal_eval.
df = pd.read_csv('data/results_llm_240208.csv')

In [24]:
# Preview the first rows to check the loaded columns before parsing triplets.
df.head()

Unnamed: 0,formation_name,paper_id,paragraph,triplet
0,Raysor Formation,55b735f3e13823bd29ba8028,"Cross Member (Santee Limestone) 1. Eocene, mid...","[{'head': 'Cross Member', 'tail': 'Santee Lime..."
1,Raysor Formation,557c3ad1e138239225f86760,The most frequently applied names for Pliocene...,"[{'head': 'Yorktown Formation', 'tail': 'Plioc..."
2,Raysor Formation,55b736e4e13823bd29ba802f,FIGURE 34.-Known distribution of Pliocene loca...,
3,Raysor Formation,58cb666bcf58f10af549ae27,"Such an age range for the phosphate-beds ""faun...","[{'head': 'age range of phosphate-beds fauna',..."
4,Raysor Formation,55b736e4e13823bd29ba802f,Ward indicates that such shells were the sourc...,"[{'head': 'shells', 'tail': 'matrix', 'relatio..."


In [25]:
# Sanity-check the triplet encoding on the first row: the CSV stores each list
# of triplets as a Python-literal string, so decode it with ast.literal_eval.
# The column is addressed by name (as the validation loop does via row.triplet)
# rather than by a positional index that silently breaks if columns move.
data = ast.literal_eval(df["triplet"].iloc[0])

# Echo the parsed triplets as a JSON string to confirm they serialize cleanly.
print(json.dumps(data))

[{"head": "Cross Member", "tail": "Santee Limestone", "relationship": "is a type of"}, {"head": "Cross Member", "tail": "Eocene, middle (Claibornian)", "relationship": "belongs to"}, {"head": "Cross Member", "tail": "South Carolina", "relationship": "is located in"}, {"head": "Ward, L. W., and others, 1979, Stratigraphic revision of Eocene, Oligocene and lower Miocene formations of South Carolina", "tail": "Cross Member", "relationship": "provides information about"}, {"head": "biomicrite (limestone)", "tail": "Cross Member", "relationship": "is a component of"}, {"head": "grayish-yellow", "tail": "biomicrite (limestone)", "relationship": "describes"}, {"head": "1 m, range 1-41 m", "tail": "biomicrite (limestone)", "relationship": "describes"}, {"head": "bryozoan-brachiopod-bivalve biomicrite, deeply burrowed", "tail": "Cross Member", "relationship": "is a component of"}, {"head": "Moultrie Member (Santee Limestone)", "tail": "Cross Member", "relationship": "unconformably underlies"}, 

In [26]:
# --- Validate every extracted triplet against the allowed relationship list ---

COMPLETION_URL = "http://127.0.0.1:8010/completion"
OUTPUT_CSV = "output.csv"
CHECKPOINT_EVERY = 10  # write results to disk after this many processed triplets
RESULT_COLUMNS = [
    "formation_name",
    "paper_id",
    "head",
    "tail",
    "relationship",
    "reasoning",
    "is_valid",
]


def _classify_relation(relation):
    """Ask the completion endpoint to judge one triplet and return its parsed verdict.

    Args:
        relation: one triplet (as parsed from the ``triplet`` column); it is
            serialized with json.dumps and substituted into PROMPT.

    Returns:
        The object obtained by JSON-decoding the "content" field of the response.

    Raises:
        KeyError: the response JSON has no "content" field.
        ValueError: the response body or its "content" is not valid JSON.
    """
    payload = {
        "temperature": 0.0,
        "prompt": PROMPT.format(input=json.dumps(relation)),
        "grammar": GRAMMAR,
    }
    response = requests.post(COMPLETION_URL, json=payload)
    return json.loads(response.json()["content"])


def _save_results(records):
    """Build the results frame from the collected records and write it to OUTPUT_CSV."""
    frame = pd.DataFrame(records, columns=RESULT_COLUMNS)
    frame.to_csv(OUTPUT_CSV, index=False)
    return frame


valid = 0
total = 0
records = []  # one tuple per successfully judged triplet, in RESULT_COLUMNS order
results_df = pd.DataFrame(columns=RESULT_COLUMNS)

try:
    for _, row in df.iterrows():
        try:
            relation_list = ast.literal_eval(row.triplet)
        except (ValueError, SyntaxError, TypeError):
            # Covers missing (NaN) cells and malformed literals, without
            # swallowing KeyboardInterrupt the way a bare except would.
            print("Error: Could not parse input.")
            continue

        for relation in relation_list:
            try:
                verdict = _classify_relation(relation)
                record = (
                    row.formation_name,
                    row.paper_id,
                    verdict["head"],
                    verdict["tail"],
                    verdict["relationship"],
                    verdict["reasoning"],
                    verdict["is_valid"],
                )
            except (KeyError, ValueError):
                # Missing keys or undecodable JSON: skip this triplet, keep going.
                print("Error: LLM output error. ")
                continue

            records.append(record)
            if verdict["is_valid"]:
                valid += 1

            total += 1
            if total % CHECKPOINT_EVERY == 0:
                print(f"Processed {valid}/{total} triplets. ")
                results_df = _save_results(records)
finally:
    # Persist whatever was collected, including the last partial batch and
    # anything gathered before an interrupt or a network failure. Skip the
    # write when nothing was collected so an earlier output file is not
    # replaced by an empty one.
    if records:
        results_df = _save_results(records)

{
  "head": "Cross Member",
  "tail": "Santee Limestone",
  "relationship": "is a type of"
}
{
  "reasoning": "The relationship 'is a type of' suggests that Cross Member is a classification or subcategory of Santee Limestone. This relationship can be considered valid as it pertains to the 'lithology has type of' relationship from the specified list.",
  "is_valid": true,
  "head": "Cross Member",
  "tail": "Santee Limestone",
  "relationship": "has lithology of"
}
{
  "head": "Cross Member",
  "tail": "Eocene, middle (Claibornian)",
  "relationship": "belongs to"
}
{
  "reasoning": "The relationship 'belongs to' indicates that the Cross Member is part of a specific time period, Eocene middle (Claibornian). This relationship pertains to geological time and not directly to lithology, sedimentary structures, or other specified geological characteristics. Therefore, it is not explicitly listed in the provided relationships but can be considered valid as it relates to a geological context."

KeyboardInterrupt: 