In [1]:
import openai
import numpy as np
import os
from typing import Dict, List
import json
import pandas as pd

In [2]:
# Read the API key from the environment rather than hard-coding it in the
# notebook, so the secret is never saved or committed with the file.
# Falls back to the previous empty string when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [3]:
# Load the corpus and the worked example from local .npy files. Each file is
# read with allow_pickle=True and unwrapped with .item(), i.e. it holds one
# pickled Python object rather than a plain numeric array.
# `literature` is later iterated with .items() in extract_knowledge()
# (paper id -> paper content); `example` is interpolated into EXAMPLE_PROMPT.
# NOTE(security): allow_pickle=True unpickles the file contents, which can
# execute arbitrary code -- only load .npy files from a trusted source.
literature = np.load('literature.npy', allow_pickle=True).item()
example = np.load('example.npy', allow_pickle=True).item()

In [5]:
def get_completion(prompt, temperature_new, model='gpt-4o-mini'):
    """Send one prompt to the OpenAI chat API and return the reply text.

    Works with both generations of the ``openai`` package: releases >= 1.0
    (detected by the presence of ``openai.OpenAI``) are called through the
    module-level ``openai.chat.completions`` client, older releases through
    the legacy ``openai.ChatCompletion`` resource that 1.x removed. Both
    paths use the module-level ``openai.api_key``.

    Args:
        prompt: Text sent as the user message.
        temperature_new: Sampling temperature forwarded to the API.
        model: Name of the chat model to query.

    Returns:
        The content of the first choice's message.
    """
    messages = [
        {"role": "system", "content": "You are a research assistant and are responsible to extract structured knowledge from climate mobility literature"},
        {"role": "user", "content": prompt},
    ]
    request = {"model": model, "messages": messages, "temperature": temperature_new}

    # The new API must be checked first: openai>=1.0 still exposes a
    # `ChatCompletion` attribute, but it only raises a "removed" error.
    if hasattr(openai, "OpenAI"):
        response = openai.chat.completions.create(**request)
    else:
        response = openai.ChatCompletion.create(**request)
    return response.choices[0].message.content

In [8]:
# Few-shot demonstration that extract_features() embeds in every request.
# This is an f-string evaluated when the cell runs: {example} is replaced by the
# text form of the `example` object, and the doubled braces {{ }} in the sample
# answer are f-string escapes that render as literal { } in the final text.
# NOTE(review): the instructions describe a triple as <head, relation, tail>
# while the sample answer uses the keys "heading"/"relationship"/"tail";
# extract_knowledge() reads the latter, so the sample answer is what fixes the
# expected output schema.
EXAMPLE_PROMPT = f"""

<Prof. Dr. Patrick Sakdapolrak>: Please extract triples from this peer-reviewed scientific paper of climate mobility: 
\"\"\"{example}\"\"\"
according to the following instructions:

First, read and understand the context carefully, especially its abstract, methods, results and conclusions. 
Second, extract knowledge related to climate mobility as triples embedded in the text. In detail, a triple should look like <head, relation, tail>. \
Head and tails are entities (like 'elderly', 'poor people', etc.) or events (like 'flooding', 'heating', 'conflicts', etc.) related to climate and \
human mobility as well as imtermediate socio-demo-economic factors. 

Keep in mind that sometimes papers do have sections called 'introduction', 'state of the art', 'related works', 'motivation', or other \
semantically similar phrases. (Section titles are reflected in the dictionary as keys.) Be careful with these sections since the authors may introduce \
other research there. You should not wrongly consider those related research for summarizing these indices. Also distinguish results of the paper \
with hypotheses, and descriptions of circumstance etc., the latters should also not be counted.

Please make your result as a list of dictionary in Python. 

<research assistant> OK, here is my result:
[{{"heading": "migrate", "relationship": "from", "tail": "northern Ghana"}},
{{"heading": "migration", "relationship": "for dealing with", "tail": "structural environmental scarcity"}},
{{"heading": "migrate", "relationship": "to", "tail": "the moister forest and coastal zones"}},
{{"heading": "migration", "relationship": "is because of", "tail": "less rainfall"}},
{{"heading": "migration", "relationship": "is related to", "tail": "the amount of vegetation"}},
{{"heading": "densely populated districts", "relationship": "have", "tail": "higher migration rates"}},
{{"heading": "migration propensities", "relationship": "are higher in", "tail": "districts with more natural resource scarcity"}},
{{"heading": "migration", "relationship": "does not increase", "tail": "in times of environmental stress in the source areas of out-migration"}},
{{"heading": "migration", "relationship": "is not because of", "tail": "degradation"}}]
"""

In [7]:
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line; it may carry a language tag such as ```json.
    lines = text.splitlines()[1:]
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def extract_features(paper: dict) -> list:
    """Ask the model to extract knowledge triples from one paper.

    Builds a few-shot prompt from ``EXAMPLE_PROMPT`` and the text form of
    ``paper``, queries ``get_completion`` at temperature 0.3, removes an
    optional Markdown code fence from the reply, prints the cleaned reply
    and parses it as JSON.

    Args:
        paper: The paper to process; it is interpolated into the prompt as-is.

    Returns:
        The parsed JSON reply. The prompt requests a JSON list of objects
        with the keys ``heading``, ``relationship`` and ``tail``.

    Raises:
        ValueError: If the reply is not valid JSON; the underlying
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    # The prompt used to contain leftover string-concatenation fragments
    # (literal `\n"` and `f"..."`) that were sent to the model verbatim.
    prompt = f"""
    Your task is to answer in a consistent style. Please perform the following actions: 
    1 - Read the example below, understand what Patrick needs and how the research assistant extracts information from the sample paper according to \
    Patrick's requirements into triples.
    2 - You will be provided with a new paper. Summarize the new paper you receive and give your answer in the same way as the research assistant.
    You just need to return the final result of action 2, and make it as JSON list.

    Here is the example:
    {EXAMPLE_PROMPT}

    Here is the new paper, please extract the information like the research assistant:
    {paper}

    Again, keep in mind that output format is JSON list
    """

    # Strip before looking for the fence so leading/trailing blank lines in
    # the reply cannot hide it.
    response = _strip_code_fence(get_completion(prompt, 0.3))
    print(response)

    try:
        return json.loads(response)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Can't make it as JSON file: {response}") from exc

In [9]:
def extract_knowledge(literature: dict, output_path: str = "extracted_triples.csv") -> pd.DataFrame:
    """Extract triples from every paper and save them as one CSV table.

    Calls ``extract_features`` once per entry of ``literature`` and flattens
    the returned triples into rows with the columns ``uuid``, ``paper_id``,
    ``heading``, ``relationship`` and ``tail``.

    Args:
        literature: Mapping of paper id to the paper content handed to
            ``extract_features``.
        output_path: Destination CSV file; defaults to the previously
            hard-coded ``extracted_triples.csv``.

    Returns:
        The DataFrame that was written to ``output_path``.
    """
    columns = ["uuid", "paper_id", "heading", "relationship", "tail"]
    results = []

    for pid, sections in literature.items():
        for tri in extract_features(sections):
            results.append({
                # Running index over the whole run: the per-paper index used
                # before restarted at 0 for every paper, so ids were repeated.
                "uuid": len(results),
                "paper_id": pid,
                "heading": tri.get("heading"),
                "relationship": tri.get("relationship"),
                "tail": tri.get("tail"),
            })

    # Passing `columns` fixes the column order and keeps the header when no
    # triple was extracted (selecting columns on an empty frame raised KeyError).
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_path, index=False)
    return df

In [10]:
# Entry point: runs the extraction over every paper in `literature`.
# extract_knowledge() calls extract_features() once per paper, so this issues
# one chat-completion request per paper and prints each cleaned model reply
# before the results are written to extracted_triples.csv.
if __name__ == "__main__":
    extract_knowledge(literature)

[
    {"heading": "climatic change", "relationship": "has played a role in", "tail": "urbanization in sub-Saharan Africa"},
    {"heading": "urbanization", "relationship": "is influenced by", "tail": "rainfall"},
    {"heading": "urbanization in sub-Saharan Africa", "relationship": "has increased due to", "tail": "shortages in rainfall"},
    {"heading": "urbanization", "relationship": "is driven by", "tail": "internal migration"},
    {"heading": "rural-urban migration", "relationship": "accounts for", "tail": "half of urban growth in Africa"},
    {"heading": "climate change", "relationship": "affects", "tail": "urbanization patterns"},
    {"heading": "rainfall", "relationship": "has declined since", "tail": "the 1960s"},
    {"heading": "urbanization rate", "relationship": "is higher in", "tail": "sub-Saharan Africa compared to other developing countries"},
    {"heading": "decolonization", "relationship": "reinforced", "tail": "the link between rainfall and urbanization"},
    {"h