# Extract Geological Data from Documents

## 1. Environment Initialization and Function Declaration

In [36]:
import os
import openai
import textract
import tiktoken
from dotenv import load_dotenv
import os
import regex as re
from tqdm import tqdm
import pandas as pd
import csv
import random
import logging
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Set the HTTP(S) proxy environment variables for this process.
os.environ["http_proxy"] = "http://10.10.1.3:10000"
os.environ["https_proxy"] = "http://10.10.1.3:10000"

# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
if openai.api_key is None:
    # os.environ only accepts strings, so writing None back would fail with an
    # opaque TypeError; fail with an explicit message instead.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file."
    )
os.environ['OPENAI_API_KEY'] = openai.api_key


# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def create_chunks(text, n, tokenizer):
    """Yield successive chunks of roughly ``n`` tokens from ``text``.

    Each chunk is cut at the latest position, between ``0.5 * n`` and
    ``1.5 * n`` tokens from its start, whose decoded text ends with a full
    stop or a newline. If no such position exists the chunk is ``n`` tokens
    long (or shorter when the text runs out).

    Args:
        text: The text to split.
        n: Target chunk size in tokens; must be at least 1.
        tokenizer: Object exposing ``encode(text)`` returning a sliceable
            token sequence and ``decode(tokens)`` returning a string.

    Yields:
        Slices of the encoded token sequence (tokens, not decoded text).

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    if n < 1:
        # With n < 1 the chunk start would never advance and the generator
        # would never terminate.
        raise ValueError(f"n must be at least 1, got {n!r}")

    tokens = tokenizer.encode(text)
    total = len(tokens)
    min_span = int(0.5 * n)
    max_span = int(1.5 * n)

    i = 0
    while i < total:
        lower = i + min_span
        # Start from the largest allowed chunk and shrink it until the decoded
        # text ends on a sentence boundary.
        j = min(i + max_span, total)
        while j > lower:
            if tokenizer.decode(tokens[i:j]).endswith((".", "\n")):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == lower:
            j = min(i + n, total)
        yield tokens[i:j]
        i = j


# Use text-davinci-003 with designed prompt to extract data
def extract_chunk_davinci(document, template_prompt):
    """Extract data from one document chunk with the text-davinci-003 model.

    Args:
        document: Text of the chunk; substituted for the ``<document>``
            placeholder in ``template_prompt``.
        template_prompt: Prompt template containing ``<document>``.

    Returns:
        The text of the first completion choice, prefixed with ``"1."``.
    """
    request = {
        "model": 'text-davinci-003',
        "prompt": template_prompt.replace('<document>', document),
        "temperature": 0,
        "max_tokens": 1500,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai.Completion.create(**request)
    first_choice = completion['choices'][0]
    return "1." + first_choice['text']


# Use gpt-3.5-turbo with designed prompt to extract data, with a retry mechanism
def extract_chunk(document, template_prompt):
    """Extract data from one document chunk with gpt-3.5-turbo.

    The call is attempted up to three times. Only rate-limit errors are
    retried, with exponential backoff plus jitter; any other exception is
    logged and ends the call immediately.

    Args:
        document: Text of the chunk; substituted for the ``<document>``
            placeholder in ``template_prompt``.
        template_prompt: Prompt template containing ``<document>``.

    Returns:
        The content of the first choice prefixed with ``"1. "``, or ``None``
        when the call failed. Callers must handle ``None``.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            prompt = template_prompt.replace('<document>', document)
            response = openai.ChatCompletion.create(
                model='gpt-3.5-turbo',
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=1500,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0
            )
            return "1. " + response['choices'][0]['message']['content']
        except openai.error.RateLimitError:  # If rate limit is exceeded
            if attempt == max_attempts - 1:
                # No retry follows the last attempt, so sleeping would only
                # delay reporting the failure.
                break
            wait_time = (2 ** attempt) + random.random()  # Exponential backoff with jitter
            logging.warning("Rate limit exceeded. Retrying after %s seconds.", wait_time)
            time.sleep(wait_time)  # Wait before retrying
        except Exception as e:  # Any other error: best effort, do not retry
            logging.error("API call failed: %s", e)
            return None  # Return None for failure
    logging.error("Failed to call OpenAI API after multiple retries due to rate limiting.")
    return None  # Return None for failure


# Call the API, using a retry mechanism to handle rate limit errors and other exceptions
def get_completion(prompt, model="gpt-3.5-turbo"):
    """Send ``prompt`` as a single user message and return the reply text.

    The call is attempted up to three times. Only rate-limit errors are
    retried, with exponential backoff plus jitter; any other exception is
    logged and ends the call immediately.

    Args:
        prompt: Full prompt text sent as the user message.
        model: Name of the chat model to use.

    Returns:
        The content of the first choice, or ``None`` when the call failed.
    """
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            messages = [{"role": "user", "content": prompt}]
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0,
            )
            return response.choices[0].message["content"]
        except openai.error.RateLimitError:  # If rate limit is exceeded
            if attempt == max_attempts - 1:
                # No retry follows the last attempt, so sleeping would only
                # delay reporting the failure.
                break
            wait_time = (2 ** attempt) + random.random()  # Exponential backoff with jitter
            logging.warning("Rate limit exceeded. Retrying after %s seconds.", wait_time)
            time.sleep(wait_time)  # Wait before retrying
        except Exception as e:  # Any other error: best effort, do not retry
            logging.error("API call failed: %s", e)
            return None  # Return None for failure
    logging.error("Failed to call OpenAI API after multiple retries due to rate limiting.")
    return None  # Return None for failure

## 2. Export and Parse .pdf file

In [37]:
# Extract the raw text from the PDF with textract's pdfminer backend
text = textract.process('data/radiolarian/541.pdf', method='pdfminer').decode('utf-8')

# Flatten the text onto one line. Note the order of the replacements: newlines
# first become "; " and then every ';' becomes ' ', so each newline ends up as
# two spaces and semicolons already present in the text are turned into spaces.
clean_text = text.replace("  ", " ")
clean_text = clean_text.replace("\n", "; ")
clean_text = clean_text.replace(';', ' ')

# Initialise tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

print(clean_text)
print("\n")
# Size of the cleaned document in thousands of tokens
token_count = len(tokenizer.encode(clean_text))
print(token_count / 1000)

Contents lists available at ScienceDirect     Palaeogeography, Palaeoclimatology, Palaeoecology     journal homepage: www.elsevier.com/locate/palaeo     Origin of Lower Carboniferous cherts in southern Guizhou, South China   Yutao Liang a,b, Xuan Tang a,b, Jinchuan Zhang a,b,*, Yang Liu a,b, Yu Zhang a,b, Kun Yuan a,c,   Tuo Lin c, Jianghui Ding d,e, Yang Wang f   a School of Energy Resources, China University of Geosciences (Beijing), Beijing 100083, China   b Key Laboratory of Strategy Evaluation for Shale Gas, Ministry of Natural Resources, Beijing 100083, China   c Oil and Gas Resources Survey, China Geological Survey, Beijing 100083, China   d CNPC Engineering Technology R & D Company Limited, Beijing 102206, China   e PetroChina Research Institute of Petroleum Exploration & Development, Beijing 100083, China   f Development Planning Office, China University of Petroleum (Beijing), Beijing 102249, China      A R T I C L E I N F O      Editor: Thomas Algeo     Keywords:   Microfoss

## 3. Sample Extraction Usage Functions

In [38]:
# Simple Entity Extraction
def simple_entity_extraction():
    """Run the sample "simple entity extraction" prompt over the parsed document.

    Uses the module-level ``clean_text`` and ``tokenizer``: the text is split
    into chunks of about 1000 tokens with ``create_chunks``, each chunk is sent
    to ``extract_chunk``, and both the raw answers and the grouped, cleaned
    answers are printed. Returns nothing.
    """
    # Example prompt - Simple Entity Extraction
    # '<document>' stays in the prompt as a placeholder; extract_chunk replaces
    # it with the text of each chunk.
    document = '<document>'
    template_prompt=f'''Extract key pieces of information from this regulation document.
    If a particular piece of information is not present, output \"Not specified\".
    When you extract a key piece of information, include the closest page number.
    Use the following format:\n0. Who is the author\n1. What is the amount of the "Power Unit Cost Cap" in USD, GBP and EUR\n2. What is the value of External Manufacturing Costs in USD\n3. What is the Capital Expenditure Limit in USD\n\nDocument: \"\"\"{document}\"\"\"\n\n0. Who is the author: Tom Anderson (Page 1)\n1.'''
    print(template_prompt)

    text_chunks = [tokenizer.decode(chunk) for chunk in create_chunks(clean_text, 1000, tokenizer)]
    results = []
    print("# Extracted data: ")
    for chunk in text_chunks:
        results.append(extract_chunk(chunk, template_prompt))
        print(results[-1])

    # extract_chunk returns None when the API call fails; skip those chunks
    # instead of crashing on None.split().
    groups = [r.split('\n') for r in results if r is not None]
    # zip the groups together so answers to the same question sit side by side
    # (zip stops at the shortest answer list)
    zipped = [
        x
        for y in zip(*groups)
        for x in y
        if "Not specified" not in x and "__" not in x
    ]
    print("# Grouped and Cleaned data: ")
    print(zipped)


# Complex Entity Extraction
def complex_entity_extraction():
    """Run the sample "complex entity extraction" prompt over the parsed document.

    Uses the module-level ``clean_text`` and ``tokenizer``: the text is split
    into chunks of about 1000 tokens with ``create_chunks``, each chunk is sent
    to ``extract_chunk``, and both the raw answers and the grouped, cleaned
    answers are printed. Returns nothing.
    """
    # Example prompt - Complex Entity Extraction
    # '<document>' stays in the prompt as a placeholder; extract_chunk replaces
    # it with the text of each chunk.
    document = '<document>'
    template_prompt=f'''Extract key pieces of information from this regulation document.
    If a particular piece of information is not present, output \"Not specified\".
    When you extract a key piece of information, include the closest page number.
    Use the following format:\n0. Who is the author\n1. How is a Minor Overspend Breach calculated\n2. How is a Major Overspend Breach calculated\n3. Which years do these financial regulations apply to\n\nDocument: \"\"\"{document}\"\"\"\n\n0. Who is the author: Tom Anderson (Page 1)\n1.'''
    print(template_prompt)

    text_chunks = [tokenizer.decode(chunk) for chunk in create_chunks(clean_text, 1000, tokenizer)]
    results = []
    print("# Extracted data: ")
    for chunk in text_chunks:
        results.append(extract_chunk(chunk, template_prompt))
        print(results[-1])

    # extract_chunk returns None when the API call fails; skip those chunks
    # instead of crashing on None.split().
    groups = [r.split('\n') for r in results if r is not None]
    # zip the groups together so answers to the same question sit side by side
    # (zip stops at the shortest answer list)
    zipped = [
        x
        for y in zip(*groups)
        for x in y
        if "Not specified" not in x and "__" not in x
    ]
    print("# Grouped and Cleaned data: ")
    print(zipped)


# Geological Data Extraction
def geo_data_extraction():
    """Run the sample geological data extraction prompt over the parsed document.

    Uses the module-level ``clean_text`` and ``tokenizer``: the text is split
    into chunks of about 1000 tokens with ``create_chunks``, each chunk is sent
    to ``extract_chunk``, and both the raw answers and the grouped, cleaned
    answers are printed. Returns nothing.
    """
    # Example prompt - Geological Data Extraction
    # '<document>' stays in the prompt as a placeholder; extract_chunk replaces
    # it with the text of each chunk.
    document = '<document>'
    # The pre-filled answer for question 0 repeats the question exactly as it is
    # declared in the format list ("What is the title").
    template_prompt=f'''Extract key pieces of information from this regulation document.
    If a particular piece of information is not present, output \"Not specified\".
    When you extract a key piece of information, include the closest page number.
    Use the following format:\n0. What is the title\n1. What is the percentage of karst in different continents\n2. What is the percentage of karst in different topographic settings\n3. What is the percentage of distribution of carbonate rocks in different climatic zones\n4. What is the absolute surface area of carbonate rocks in different continents\n\nDocument: \"\"\"{document}\"\"\"\n\n0. What is the title: Global distribution of carbonate rocks and karst water resources (Page 1)\n1.'''
    print(template_prompt)

    text_chunks = [tokenizer.decode(chunk) for chunk in create_chunks(clean_text, 1000, tokenizer)]
    results = []
    print("# Extracted data: ")
    for chunk in text_chunks:
        results.append(extract_chunk(chunk, template_prompt))
        print(results[-1])

    # extract_chunk returns None when the API call fails; skip those chunks
    # instead of crashing on None.split().
    groups = [r.split('\n') for r in results if r is not None]
    # zip the groups together so answers to the same question sit side by side
    # (zip stops at the shortest answer list)
    zipped = [
        x
        for y in zip(*groups)
        for x in y
        if "Not specified" not in x and "__" not in x
    ]
    print("# Grouped and Cleaned data: ")
    print(zipped)

## 4. Extract Structured Data with Prompt 1 (generalized)

In [39]:
import ast
import json


# Attributes to extract from the document; each becomes one numbered question.
field_list = [
    "section name",
    "location of the samples and sections",
    "GPS location",
    "associated fossils",
    "lithology",
    "number of species and genera found",
]
print("# field list: ")
print(field_list)

# Question 0 always asks for the title; the fields follow, numbered from 1, e.g.
#   1. What is the value of the 'section name' attribute
#   3. What is the value of the 'GPS location' attribute
question_lines = ["0. What is the value of the 'title' attribute"]
question_lines.extend(
    f"{number}. What is the value of the '{name}' attribute"
    for number, name in enumerate(field_list, start=1)
)
question_format = "\n".join(question_lines)

# Example prompt - Geological Data Extraction
# '<document>' stays in the prompt as a placeholder; extract_chunk replaces it
# with the text of each chunk.
document = '<document>'
template_prompt = f'''Extract key pieces of information from this regulation document.
If a particular piece of information is not present, output \"Not specified\".
When you extract a key piece of information, include the closest page number.
---
Use the following format:
{question_format}
---
Document: \"\"\"{document}\"\"\"\n
0. What is the value of the 'title' attribute: Origin of Lower Carboniferous cherts in southern Guizhou, South China (Page 1)
1.'''


results = []
chunks = create_chunks(clean_text, 1000, tokenizer)
text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
# test usage: the first 3 chunks
# text_chunks = text_chunks[0: 3]
print("\n# text_chunk numbers:", len(text_chunks))  # 23 chunks for 541.pdf
print("\n# Extracting data...")
# A ThreadPoolExecutor could be introduced here to process the chunks in
# multiple threads and speed this step up.
for chunk in tqdm(text_chunks):
    results.append(extract_chunk(chunk, template_prompt))
    # print(results[-1])

# extract_chunk returns None when the API call fails; skip those chunks instead
# of crashing on None.split(). `results` itself is left untouched.
groups = [r.split('\n') for r in results if r is not None]
# The answers are not zipped together here: gpt-3.5-turbo responses can be
# incomplete, and zip() would truncate every chunk to the shortest answer list.
# Instead, flatten all answer lines and sort them so that lines answering the
# same numbered question end up next to each other.
groups = sorted(line for answer_lines in groups for line in answer_lines)
groups = [x for x in groups if "Not specified" not in x and "__" not in x]
zipped = groups

# remove too long answers (keep len(r) <= 180)
zipped = [r for r in zipped if len(r) <= 180]
print("\n# Grouped and Cleaned data: ")
for item in zipped:
    print(item)


# data_list = zipped
# prompt = f"""
# Translate the following python list to a dictionary: {data_list}
# """
# prompt = f"""
# Translate the following python list to a dictionary with "section name", "locations of the boulders, samples and sections", \
#     "gps location", "associated fossils", "lithology", "number of species or genera", and "number of new species or new genera" \
#     as the keys, and set the values as precisely and concisely as possible, and if value is "null", set as "[]": {data_list}
# """

# Hard-coded sample of cleaned answer lines, used as the PARAGRAPH of the worked
# example inside the prompt below.
zipped_example = ["1. What is the value of the 'section name' attribute: The end-Triassic extinction event (ETE) (Page 1)", "1. What is the value of the 'section name' attribute: Katsuyama section (Page 2)", "1. What is the value of the 'section name' attribute: The Inuyama area (Page 1)", "2. What is the value of the 'location of the samples and sections' attribute: Katsuyama section, Inuyama, Japan (Page 1)", "2. What is the value of the 'location of the samples and sections' attribute: Inuyama area, central Japan (Page 2)", "2. What is the value of the 'location of the samples and sections' attribute: Rock samples from TJ-3 to TJ + 4 (3 beds above TJ + 1) continuously (Page 2)", "3. What is the value of the 'GPS location' attribute: N 35◦25.367′, E 136◦58.261 (Page 2)", "4. What is the value of the 'associated fossils' attribute: Sea surface-dwelling radiolaria (Page 1)", "4. What is the value of the 'associated fossils' attribute: Radiolarian fossils (Page 1)", "4. What is the value of the 'associated fossils' attribute: Radiolarian fossils (Page 3)", "5. What is the value of the 'lithology' attribute: Bedded chert (Page 1)", "5. What is the value of the 'lithology' attribute: Bedded chert and siliciclastic rocks (Page 2)", "5. What is the value of the 'lithology' attribute: Siliceous mudstone, bedded chert sequence, and siliciclastic rocks (Page 1)"]
# str() of a list is "[...]"; the two slices drop the leading '[' and the
# trailing ']' so only the comma-separated quoted items remain.
zipped_str_example = str(zipped_example)[1:][:-1]
# KEYWORDS shown in the worked example.
field_list_example = ["section name", "location of the samples and sections", "GPS location", 
                      "associated fossils", "lithology", "number of species and genera found"]
# Same bracket-stripping applied to the answer lines extracted from the current document.
zipped_str = str(zipped)[1:][:-1]
# One-shot prompt: a worked example (PARAGRAPH / KEYWORDS / OUTPUT) followed by
# the real PARAGRAPH and KEYWORDS, ending at "OUTPUT" for the model to complete.
# Doubled braces ({{ and }}) produce literal braces in the f-string.
prompt = f'''You will read a paragraph, summarise it in JSON format according to keywords and remove duplicate values.
---
Here is an example: 

PARAGRAPH
{zipped_str_example}
KEYWORDS
{field_list_example}
OUTPUT
{{
    "section name": [
        "The end-Triassic extinction event (ETE)",
        "Katsuyama section",
        "The Inuyama area"
    ],
    "location of the samples and sections": [
        "Katsuyama section, Inuyama, Japan",
        "Inuyama area, central Japan",
        "Rock samples from TJ-3 to TJ + 4 (3 beds above TJ + 1) continuously"
    ],
    "GPS location": [
        "N 35◦25.367′, E 136◦58.261"
    ],
    "associated fossils": [
        "Sea surface-dwelling radiolaria",
        "Radiolarian fossils"
    ],
    "lithology": [
        "Bedded chert",
        "Bedded chert and siliciclastic rocks",
        "Siliceous mudstone, bedded chert sequence, and siliciclastic rocks"
    ],
    "number of species and genera found": []
}}
---
Here is the paragragh you need to process, summarise it in JSON format according to keywords and remove duplicate values: 

PARAGRAPH
{zipped_str}
KEYWORDS
{field_list}
OUTPUT

'''

response = get_completion(prompt)
print("\n# Transformed data: (type => string)")
print(response)

# get_completion returns None when the API call fails; stop with a clear message
# rather than an opaque parsing error.
if response is None:
    raise RuntimeError("No completion was returned by the OpenAI API; nothing to parse.")

# The prompt asks for JSON, so parse it as JSON first (this handles null / true /
# false). Fall back to a Python literal for replies that use single quotes.
try:
    res_json = json.loads(response)
except json.JSONDecodeError:
    res_json = ast.literal_eval(response)
print("\n# Transformed JSON data: (type => JSON)")
print(json.dumps(res_json, indent=4))

# Make sure the output directory exists before writing the result.
os.makedirs('results', exist_ok=True)
with open('results/result.json', 'w', newline='\n') as file:
    json.dump(res_json, file, indent=4)

# field list: 
['section name', 'location of the samples and sections', 'GPS location', 'associated fossils', 'lithology', 'number of species and genera found']

# text_chunk numbers: 23

# Extracting data...


100%|██████████| 23/23 [02:49<00:00,  7.39s/it]



# Grouped and Cleaned data: 
1. What is the value of the 'section name' attribute: Dawuba Formation (Page 1)
1. What is the value of the 'section name' attribute: Foraminifera-bearing chert (Page 2)
1. What is the value of the 'section name' attribute: Geological setting (Page 1)
1. What is the value of the 'section name' attribute: Getuhe section, Luogang section (Page 6)
1. What is the value of the 'section name' attribute: Insights from silicon and oxygen isotope geochemistry (Page 15)
1. What is the value of the 'section name' attribute: Manchang, Getuhe, and Luogang sections (Page 1)
1. What is the value of the 'section name' attribute: Manchang, Luogang, and Getuhe sections (Page 1)
1. What is the value of the 'section name' attribute: Petrological characteristics (Page 4)
1. What is the value of the 'section name' attribute: Radiolarian-bearing chert (Page 6)
1. What is the value of the 'section name' attribute: The Dawuba Formation (Page 1)
2. What is the value of the 'locatio

In [40]:
# Debug view: replay the post-processing of `results` step by step, printing
# every intermediate value.
# extract_chunk returns None when the API call fails; skip those entries instead
# of crashing on None.split().
groups = [r.split('\n') for r in results if r is not None]
print(groups)
# Flatten the per-chunk answer lists into one list of answer lines
groups = [y for x in groups for y in x]
print(groups)
print(type(groups))
# Sorting puts lines answering the same numbered question next to each other
groups = sorted(groups)
print(groups)
groups = [x for x in groups if "Not specified" not in x and "__" not in x]
print(groups)
print(zipped)
# groups = [r.split('\n') for r in results]
# # zip the groups together
# zipped = list(zip(*groups))
# zipped = [x for y in zipped for x in y if "Not specified" not in x and "__" not in x]
# print(groups)
# results = results[1:]
# print(results[0])
# for item in results:
#     print(item)
# print(text_chunks[0])
# doc = "In this work, we study the use of the Bellman equation as a surrogate objective for value prediction accuracy. While the Bellman equation is uniquely solved by the true value function over all state-action pairs, we find that the Bellman error (the difference between both sides of the equation) is a poor proxy for the accuracy of the value function. In particular, we show that (1) due to cancellations from both sides of the Bellman equation, the magnitude of the Bellman error is only weakly related to the distance to the true value function, even when considering all stateaction pairs, and (2) in the finite data regime, the Bellman equation can be satisfied exactly by infinitely many suboptimal solutions. This means that the Bellman error can be minimized without improving the accuracy of the value function. We demonstrate these phenomena through a series of propositions, illustrative toy examples, and empirical analysis in standard benchmark domains."
# tmp = extract_chunk(text_chunks[0], template_prompt)
# tmp = extract_chunk(doc, template_prompt)
# print(tmp)

[["1. What is the value of the 'section name' attribute: Not specified", "2. What is the value of the 'location of the samples and sections' attribute: Lower Carboniferous Dawuba Formation in southern Guizhou (Page 1)", "3. What is the value of the 'GPS location' attribute: Not specified", "4. What is the value of the 'associated fossils' attribute: sponge spicules (monaxons and triaxon hexactines), radiolarians, and foraminifera (Endothyracea) (Page 1)", "5. What is the value of the 'lithology' attribute: Manchang spicule-bearing chert (MSC), Manchang foraminifera-bearing chert (MFC), Getuhe radiolarian-bearing chert (GRC), and Luogang foraminifera-bearing chert (LFC) (Page 1)", "6. What is the value of the 'number of species and genera found' attribute: Not specified"], ["1. What is the value of the 'section name' attribute: Geological setting (Page 1)", "2. What is the value of the 'location of the samples and sections' attribute: Manchang section (25.69847◦N, 106.08623◦E), Getuhe s

## 4. Extract Structured Data with Prompt 1

In [41]:
# import ast
# import json

# # Example prompt - Geological Data Extraction
# document = '<document>'
# template_prompt=f'''Extract key pieces of information from this regulation document.
# If a particular piece of information is not present, output \"Not specified\".
# When you extract a key piece of information, include the closest page number.
# ---
# Use the following format:
# 0. What is the title
# 1. What is the section name
# 2. What are the locations of the boulders, samples and sections
# 3. What is the gps location
# 4. What are the associated fossils
# 5. What is the lithology
# 6. What is the number of species or genera found
# 7. What is the number of new species or new genera found
# ---
# Document: \"\"\"{document}\"\"\"\n
# 0. Who is the title: Origin of Lower Carboniferous cherts in southern Guizhou, South China (Page 1)
# 1.'''

# results = []
# chunks = create_chunks(clean_text, 1000, tokenizer)
# text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
# # test usage: the first 3 chunks
# text_chunks = text_chunks[0: 3]
# print("\n# text_chunk numbers:", len(text_chunks))  # 23 chunks for 541.pdf
# print("\n# Extracted data: ")
# for chunk in text_chunks:
#     results.append(extract_chunk(chunk, template_prompt))
#     # print(results[-1])

# groups = [r.split('\n') for r in results]
# # zip the groups together
# zipped = list(zip(*groups))
# zipped = [x for y in zipped for x in y if "Not specified" not in x and "__" not in x]
# print("\n# Grouped and Cleaned data: ")
# for item in zipped:
#     print(item)


# data_list = zipped
# # prompt = f"""
# # Translate the following python list to a dictionary: {data_list}
# # """
# prompt = f"""
# Translate the following python list to a dictionary with "section name", "locations of the boulders, samples and sections", \
#     "gps location", "associated fossils", "lithology", "number of species or genera", and "number of new species or new genera" \
#     as the keys, and set the values as precisely and concisely as possible, and if value is "null", set as "[]": {data_list}
# """
# response = get_completion(prompt)
# print("\n# Transformed data: ")
# print(response)
# res_json = ast.literal_eval(response)
# print("\n# Transformed JSON data: ")
# print(json.dumps(res_json, indent=4))

# with open('results/715.json', 'w', newline='\n') as file:
#     json.dump(res_json, file, indent=4)

## 5. Extract Structured Data with Prompt 2

In [42]:
# import ast
# import json

# # Example prompt - Geological Data Extraction (segment 2)
# document = '<document>'
# template_prompt = f"""Sample text:
# <tr class="mergedrow"><th scope="row" class="infobox-label"><div style="text-indent:-0.9em;margin-left:1.2em;font-weight:normal;">•&nbsp;<a href="/wiki/Monarchy_of_Canada" title="Monarchy of Canada">Monarch</a> </div></th><td class="infobox-data"><a href="/wiki/Charles_III" title="Charles III">Charles III</a></td></tr>
# <tr class="mergedrow"><th scope="row" class="infobox-label"><div style="text-indent:-0.9em;margin-left:1.2em;font-weight:normal;">•&nbsp;<span class="nowrap"><a href="/wiki/Governor_General_of_Canada" title="Governor General of Canada">Governor General</a></span> </div></th><td class="infobox-data"><a href="/wiki/Mary_Simon" title="Mary Simon">Mary Simon</a></td></tr>
# <b>Provinces and Territories</b class='navlinking countries'>
# <ul>
# <li>Saskatchewan</li>
# <li>Manitoba</li>
# <li>Ontario</li>
# <li>Quebec</li>
# <li>New Brunswick</li>
# <li>Prince Edward Island</li>
# <li>Nova Scotia</li>
# <li>Newfoundland and Labrador</li>
# <li>Yukon</li>
# <li>Nunavut</li>
# <li>Northwest Territories</li>
# </ul>

# Question: List all relevant attributes about 'Canada' that are exactly mentioned in this sample text if any.
# Answer: 
# - Monarch: Charles III
# - Governor General: Mary Simon
# - Provinces and Territories: Saskatchewan, Manitoba, Ontario, Quebec, New Brunswick, Prince Edward Island, Nova Scotia, Newfoundland and Labrador, Yukon, Nunavut, Northwest Territories

# ----

# Sample text:
# Patient birth date: 1990-01-01
# Prescribed medication: aspirin, ibuprofen, acetaminophen
# Prescribed dosage: 1 tablet, 2 tablets, 3 tablets
# Doctor's name: Dr. Burns
# Date of discharge: 2020-01-01
# Hospital address: 123 Main Street, New York, NY 10001

# Question: List all relevant attributes about 'medications' that are exactly mentioned in this sample text if any.
# Answer: 
# - Prescribed medication: aspirin, ibuprofen, acetaminophen
# - Prescribed dosage: 1 tablet, 2 tablets, 3 tablets

# ----

# Sample text:
# {document}

# Question: List all relevant attributes about 'radiolarians' that are exactly mentioned in this sample text if any. 
# Answer:"""

# results = []
# chunks = create_chunks(clean_text, 1000, tokenizer)
# text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
# # test usage: the first 5 chunks
# # text_chunks = text_chunks[0: 5]
# print("\n# text_chunk numbers:", len(text_chunks))  # 23 chunks for 541.pdf
# print("\n# Extracted data: ")
# for chunk in text_chunks:
#     results.append(extract_chunk(chunk, template_prompt))
#     print(results[-1])

# groups = [r.split('\n') for r in results]
# groups = [y for x in groups for y in x]
# print("\n# Grouped and Cleaned data: ")
# for item in groups:
#     print(item)


# data_list = groups
# prompt = f"""
# Translate the following python list to a dictionary with "section name", "locations of the boulders, samples and sections", \
#     "gps location", "associated fossils", "lithology", "number of species or genera", and "number of new species or new genera" \
#     as the keys, and set the values as precisely and concisely as possible, and if value is "null", set as "[]": {data_list}
# """
# response = get_completion(prompt)
# res_json_1 = ast.literal_eval(response)
# print("\n# Transformed JSON data: ")
# print(json.dumps(res_json_1, indent=4))

# with open('results/361_1.json', 'w', newline='\n') as file:
#     json.dump(res_json_1, file, indent=4)