In [10]:
import openai
import numpy as np
import os
from typing import Dict, List
import json
import pandas as pd

In [2]:
# Read the API key from the environment instead of embedding it in the
# notebook. Falls back to the empty string when OPENAI_API_KEY is unset,
# which matches the previous hard-coded value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [12]:
def _load_npy_object(path):
    """Load a ``.npy`` file and return the single Python object stored in it.

    NOTE(security): ``allow_pickle=True`` lets ``np.load`` unpickle arbitrary
    objects, which can execute code. Only load files from a trusted source.
    """
    return np.load(path, allow_pickle=True).item()


# `literature` is iterated with `.items()` further down, so it is presumably a
# dict keyed by paper id -- confirm against the script that wrote the file.
literature = _load_npy_object('literature.npy')
# `example` is interpolated into EXAMPLE_PROMPT as the worked sample paper.
example = _load_npy_object('example.npy')

In [43]:
def get_completion(prompt, temperature_new, model='gpt-4o-mini'):
    """Send ``prompt`` to the chat model and return the text of its reply.

    The request is a fixed system message (framing the model as a research
    assistant for climate mobility literature) followed by ``prompt`` as the
    only user message.

    Args:
        prompt: Text placed in the user message.
        temperature_new: Value forwarded as the ``temperature`` argument.
        model: Model identifier forwarded to the API.

    Returns:
        The ``content`` of the first choice's message.
    """
    system_turn = {
        "role": "system",
        "content": "You are a research assistant and are responsible to extract structured knowledge from climate mobility literature",
    }
    user_turn = {"role": "user", "content": prompt}

    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK
    # interface -- confirm the openai version this notebook is pinned to.
    completion = openai.ChatCompletion.create(
        model=model,
        messages=[system_turn, user_turn],
        temperature=temperature_new,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [45]:
# Few-shot example embedded in every extraction prompt: a simulated request
# carrying the sample paper and the coding instructions (indices a-i),
# followed by the research assistant's answer as a dictionary.
#
# This is an f-string, so `example` is interpolated once, when this cell runs;
# the cell that loads `example` must therefore be executed first. The doubled
# braces in the last line render as literal `{` / `}` around the answer, and a
# trailing backslash joins a line with the next one inside the string.
EXAMPLE_PROMPT = f"""

<Prof. Dr. Patrick Sakdapolrak>: Please extract information from this peer-reviewed scientific paper of climate mobility discipline: 
\"\"\"{example}\"\"\"
according to the following instructions:

First, read and understand the context carefully, especially its abstract, methods, results and conclusions. \
Second, summarize the paper with the following indices:
a. Methodology used by researchers for this study, use 1 to represent qualitative data analysis, 2 for quantitative data analysis, and 3 for mixed methods.
b. Focal group of people of the study (with different scales), 1 means it focused on individuals, 2 means it focused on households, 3 means it focused on \
subnations, 4 means it focused on nations, 5 means it focused on international or global scale, 6 means it considerd mixed types of groups.
c. Research areas (Here it denotes the 'origins' of those migrants, which is also the areas being influenced by climatic events and other factors), \
1 is urban areas, 2 is rural areas, 3 means mixed areas.
d. Factors influencing human mobility discussed in this paper, 1 denotes all socio-demographic-economic factors, 2 denotes environmental factors \
3 means both.
e. Does the research consider time frame, 0 means no, 1 means yes, 2: means it contain the prediction or projection of the future.
f: Does the research consider the difference between genders, 0 means no, 1 means yes.
g: Human migration type considered in the paper, 1 means internal, while 2 means international, and 3 means both.
h: Sample size (i.e. a follow-up of index b, means how many samples did it consider, no matter which scale it chose).
i: Focused countries (same as c, consider the 'origins' of those migrants, which is also the areas being influenced by climatic events and other factors) \
Here please use string rather than numeric codes or numbers as a to h.

Keep in mind that sometimes papers do have sections called 'introduction', 'state of the art', 'related works', 'motivation', or other \
semantically similar phrases. Be careful with these sections since the authors may introduce other research there. You should not wrongly consider \
those related research for summarizing these indices.

Finally please make your result as a dictionary in Python. Give each entry a UUID from 0. 

<research assistant> OK, here is my result:
{{"uuid": "0", "method": 3, "focal_scale": 1, "geographic_area": 2, "factors": 3, "time_frame": 0, "gender": 0, "migration_type": 1, "sample_size": 203, "focused_countries": "Ghana"}}
"""

In [41]:
def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a surrounding Markdown code fence, if present."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Split the *stripped* text so the opening fence is always the first line,
    # even when the reply began with blank lines or spaces.
    lines = text.splitlines()[1:]  # drop the opening fence (and any language tag)
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def extract_features(paper: dict, temperature: float = 0.3) -> dict:
    """Ask the chat model to code one paper and return the parsed JSON object.

    The prompt shows the model the worked example in ``EXAMPLE_PROMPT``, then
    the new paper, and asks for an answer in the same shape as JSON. A Markdown
    code fence around the reply is removed before parsing.

    Args:
        paper: The paper to summarise; it is interpolated into the prompt via
            its string representation.
        temperature: Sampling temperature forwarded to ``get_completion``.

    Returns:
        The decoded JSON object as a ``dict``.

    Raises:
        ValueError: If the reply is not valid JSON, or decodes to something
            other than a JSON object.
    """
    # Plain f-string template: the previous version carried leftover `\n"` and
    # `f"` fragments from string concatenation, which were sent to the model.
    prompt = f"""
    Your task is to answer in a consistent style. Please perform the following actions: 
    1 - Read the example below, understand what Patrick needs and how the research assistant extracts information from the sample paper according to \
    Patrick's requirements.
    2 - You will be provided with a new paper. Summarize the new paper you receive and give your answer in the same way as the research assistant.
    You just need to return the final result of action 2, and make it as JSON format.

    Here is the example:
    {EXAMPLE_PROMPT}

    Here is the new paper, please extract the information like the research assistant:
    {paper}

    Keep in mind that output format is JSON
    """

    response = _strip_code_fence(get_completion(prompt, temperature))

    try:
        data = json.loads(response)
    except json.JSONDecodeError as exc:
        raise ValueError(f"Can't make it as JSON file: {response}") from exc

    # The caller adds keys to the result, so anything but an object is an error.
    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object but got {type(data).__name__}: {response}"
        )

    return data

In [24]:
def extract_knowledge(literature: dict, output_path: str = "extracted_knowledge.csv"):
    """Extract features for every paper and write them to a CSV file.

    Each value of ``literature`` is passed to ``extract_features``; the
    returned dict is tagged with the paper's key under ``"id"``. Only the
    columns listed below are kept (so e.g. a ``"uuid"`` key in the reply is
    dropped), in that order.

    Args:
        literature: Mapping of paper id to the paper content.
        output_path: Destination of the CSV file.

    Returns:
        The ``DataFrame`` that was written to ``output_path``.
    """
    columns = [
        "id", "method", "focal_scale", "geographic_area", "factors",
        "time_frame", "gender", "migration_type", "sample_size",
        "focused_countries",
    ]

    records = []
    for pid, sections in literature.items():
        features = extract_features(sections)
        features["id"] = pid
        records.append(features)

    # `reindex` instead of `df[columns]`: a column missing from every reply
    # (or an empty `literature`) yields an empty/NaN column rather than a
    # KeyError that would discard all results already collected.
    df = pd.DataFrame(records).reindex(columns=columns)
    df.to_csv(output_path, index=False)
    return df

In [46]:
# Entry point: run the extraction over every paper in `literature` when this
# code is executed as the main module (skipped when it is imported).
if __name__ == "__main__":
    extract_knowledge(literature)