## Long- and Short-Distance Extraction from Unstructured Text

This experiment uses LLMs to extract distance-related information from unstructured climate mobility literature.

### Libraries

In [1]:
from openai import OpenAI, RateLimitError, APIError, APITimeoutError, AuthenticationError, BadRequestError, NotFoundError
from google.api_core.exceptions import ResourceExhausted, RetryError, DeadlineExceeded
import google.generativeai as genai

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# from multiprocessing import Pool
import time
import os
import ast
from tqdm import tqdm

### Layout

In [2]:
# Set every listed pandas display option to None (pandas treats None as "no limit" for these).
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
    'display.max_seq_items',
):
    pd.set_option(_display_option, None)

### Data

In [18]:
# Each .npy file stores one pickled Python object; .item() unwraps it from the array.
# `literature` is later iterated with .values() and `shot` is indexed with 0, 1, 2.
# NOTE: allow_pickle=True unpickles the file contents, so only load files you trust.
literature, shot = (
    np.load(npy_path, allow_pickle=True).item()
    for npy_path in ('literature.npy', 'example.npy')
)

### Pre-defined variables

In [4]:
# Output-token cap passed to the OpenAI ('gpt') and DeepSeek ('ds') requests in get_completion.
MAX_TOKEN = 16

### General commands for getting response

In [43]:
def api_def(provider):
    """Create or configure the API client for one LLM provider.

    API keys are read from ``api.txt`` (one key per line) so that they never
    appear in the notebook itself.

    Parameters
    ----------
    provider : str
        One of 'chatgpt', 'deepseek' or 'gemini'.

    Returns
    -------
    OpenAI or None
        An ``OpenAI`` client for 'chatgpt' and 'deepseek'. For 'gemini' the
        ``genai`` module is configured in place and ``None`` is returned.

    Raises
    ------
    ValueError
        If ``provider`` is not recognised, or ``api.txt`` has no key on the
        line this provider reads.
    """
    # Zero-based line of api.txt that holds each provider's key.
    # Gemini reads index 3: per the author's note, the project tied to the key
    # "in line 2" exceeded its usage limit.
    key_line = {'chatgpt': 0, 'deepseek': 1, 'gemini': 3}

    # Validate first so a typo fails loudly instead of silently returning None.
    if provider not in key_line:
        raise ValueError(f"Unknown provider {provider!r}; expected one of {sorted(key_line)}")

    # API keys are hidden in another file
    with open('api.txt', 'r', encoding='utf-8') as file:
        lines = [line.strip() for line in file]

    index = key_line[provider]
    if index >= len(lines) or not lines[index]:
        raise ValueError(f"api.txt has no key on line {index + 1} for provider {provider!r}")
    api_key = lines[index]

    # Different LLMs need different kinds of command
    if provider == 'chatgpt':
        return OpenAI(api_key=api_key, base_url="https://api.openai.com/v1")

    if provider == 'deepseek':
        return OpenAI(api_key=api_key, base_url="https://api.deepseek.com")

    # provider == 'gemini': the SDK is configured module-wide, there is no client object.
    genai.configure(api_key=api_key)
    return None

In [6]:
def get_completion(prompt, provider, model, temperature):
    """Send one classification prompt to the chosen LLM and return its text reply.

    Parameters
    ----------
    prompt : str
        The user prompt (context, optional few-shot examples and the paper).
    provider : str
        'gpt' (OpenAI Responses API), 'ds' (DeepSeek chat completions) or
        'gemini' (google.generativeai).
    model : str
        Model name forwarded unchanged to the provider.
    temperature : float
        Sampling temperature forwarded unchanged to the provider.

    Returns
    -------
    str
        The text of the model's reply.

    Raises
    ------
    ValueError
        If ``provider`` is not one of the supported values.
    """
    # Fail fast: without this check an unknown provider fell through every branch
    # and None was returned as if it were a model answer.
    if provider not in ('gpt', 'ds', 'gemini'):
        raise ValueError(f"Unknown provider {provider!r}; expected 'gpt', 'ds' or 'gemini'")

    system_prompt = "You are an expert in Human Geography and are reviewing papers. \
    Your task is to read a piece of provided text and simply classify it according to the prompt. Please answer the question without deep thinking"
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}]

    # Different LLMs need different kinds of command
    if provider == 'gpt':
        client = api_def('chatgpt')
        response = client.responses.create(
            model=model,
            input=messages,
            temperature=temperature,
            max_output_tokens=MAX_TOKEN
        )
        return response.output_text

    if provider == 'ds':
        client = api_def('deepseek')
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=MAX_TOKEN
        )
        return response.choices[0].message.content

    # provider == 'gemini'
    api_def('gemini')
    # Pass the same system prompt the other two providers receive; previously Gemini
    # only saw the user prompt, which made the cross-model comparison uneven.
    model_gemini = genai.GenerativeModel(model, system_instruction=system_prompt)
    response = model_gemini.generate_content(
        prompt,
        # NOTE(review): Gemini uses a hard-coded cap of 100 rather than MAX_TOKEN - confirm this is intentional.
        generation_config={"temperature": temperature, "max_output_tokens": 100}
    )
    return response.text

In [9]:
# Task description placed at the top of every prompt. It defines the four labels:
# '1' long distance, '2' short distance, '3' both, '0' nothing about distance.
# (Plain string: it contains no placeholders, so no f-prefix is needed.)
CONTEXT = """
We are reviewing a series of papers regarding whether they discuss 'long distance human migration', 'short distance human migration', or both of them, \
or even none of them. For each paper, give only one character as result: If it discusses 'long distance human migration', return '1'; If it discusses \
'short distance human migration', please return '2'; If it discusses both 'long distance human migration' and 'short distance human migration', return '3', \
while if it discusses nothing about distance, please return '0'. \n

While classifying a paper, consider only the content of this exact paper. Do not use information from elsewhere. Do not infer missing information from \
past tasks or papers. Focus primarily on the abstract, data/methods, results, and conclusions sections. \
Do not rely on introduction, state-of-the-art, related works, or motivation sections, especially when they describe other researchers’ work. \
Only use information from those sections if they explicitly describe what this paper does. \
Do not output anything else such as explanations, opinions or meta-comments. 
"""

In [21]:
def get_response(paper: dict, example, provider, model, temperature):
    """Build the zero-, one- or three-shot prompt for one paper and return the model's reply.

    Parameters
    ----------
    paper : dict
        The paper to classify; it is interpolated into the prompt as-is.
    example : int
        Number of worked examples to include: 0, 1 or 3. The examples come from
        the module-level ``shot`` object (``shot[0]``, ``shot[1]``, ``shot[2]``).
    provider, model, temperature
        Forwarded unchanged to ``get_completion``.

    Returns
    -------
    str
        Whatever ``get_completion`` returns for the assembled prompt.

    Raises
    ------
    ValueError
        If ``example`` is not 0, 1 or 3.
    """
    # Previously an unsupported value left `prompt` unassigned and surfaced as an
    # UnboundLocalError further down.
    if example not in (0, 1, 3):
        raise ValueError(f"Unsupported number of examples {example!r}; expected 0, 1 or 3")

    # The closing part of each prompt used to contain literal `"` and `f"` fragments
    # left over from an earlier string concatenation; they are removed here so the
    # model only sees the instruction text and the paper.
    # NOTE(review): the expert answers 1 / 3 / 2 are hard-coded and presumably match
    # the labels of shot[0] / shot[1] / shot[2] - confirm against example.npy.
    if example == 3:
        prompt = f"""
            {CONTEXT} \n
            <Group leader>: Please assign a character for the following paper: \n
            \"\"\"{shot[0]}\"\"\"
            \n
            <Expert>: My answer is: 1 \n
            <Group leader>: Please assign a character for another paper: \n
            \"\"\"{shot[1]}\"\"\"
            \n
            <Expert>: My answer is: 3 \n
            <Group leader>: Please assign a character for another paper: \n
            \"\"\"{shot[2]}\"\"\"
            \n
            <Expert>: My answer is: 2 \n
            
            <Group leader>: Here is a new paper, please classify it like the expert: \n
            {paper}\n
            Keep in mind that output is only one character.
            """

    elif example == 1:
        prompt = f"""
            {CONTEXT} \n
            <Group leader>: Please assign a character for the following paper: \n
            \"\"\"{shot[0]}\"\"\"
            \n
            <Expert>: My answer is: 1 \n
            
            <Group leader>: Here is a new paper, please classify it like the expert: \n
            {paper}\n
            Keep in mind that output is only one character.
            """

    else:  # example == 0
        prompt = f"""
            {CONTEXT} \n
            <Group leader>: Please assign a character for the following paper according to the requirement: \n
            {paper}\n
            Keep in mind that output is only one character.
            """

    return get_completion(prompt, provider, model, temperature)

### Organization

In [44]:
def QA_to_df(literature, example, provider, model, idx):
    """Query the model repeatedly for one paper and collect the answers.

    The paper is classified 10 times at each temperature in
    ``np.round(np.arange(0.0, 2.0, 0.4), 1)`` (0.0, 0.4, 0.8, 1.2, 1.6).

    Parameters
    ----------
    literature
        The paper, forwarded to ``get_response``.
    example : int
        Number of few-shot examples, forwarded to ``get_response``.
    provider, model : str
        Forwarded to ``get_response`` and recorded with each answer.
    idx : int
        Zero-based position of the paper; only used for the progress-bar label.

    Returns
    -------
    pandas.DataFrame
        One row per request with the columns 'answer', 'provider', 'model',
        'few shot', 'temperature' and 'run'.

    Raises
    ------
    ValueError
        If a single request fails 10 times in a row with a retryable API error.
    """
    max_retries = 10
    runs_per_temperature = 10
    temperatures = np.round(np.arange(0.0, 2.0, 0.4), 1)
    results = []

    for t in tqdm(temperatures, desc=f"File {idx+1}", leave=True):
        for run_id in range(runs_per_temperature):
            # Count failures per request. The counter used to be shared by the whole
            # paper and never reset, so ten scattered transient errors across the 50
            # requests aborted the run.
            fail_count = 0
            while True:
                try:
                    answer = get_response(literature, example, provider, model, t)
                except (RateLimitError, APIError, APITimeoutError, ResourceExhausted) as e: # AuthenticationError, BadRequestError, NotFoundError
                    fail_count += 1
                    if fail_count >= max_retries:
                        raise ValueError("Can't get response, retried too many times") from e
                    time.sleep(1)
                    continue

                results.append({
                    "answer": answer,
                    "provider": provider,
                    "model": model,
                    "few shot": example,
                    "temperature": t,
                    "run": run_id,
                })
                # Pause between successful requests to stay under rate limits.
                time.sleep(1)
                break

    return pd.DataFrame(results)

In [15]:
def batch_extract_to_df(all_literature, example, provider, model):
    """Classify every paper with one provider/prompt setting, checkpointing after each paper.

    Results are written to ``results_s/{provider}_{example}.csv``. If that file
    already exists, papers whose ``file_id`` is in it are skipped, so an
    interrupted run can be resumed by calling the function again.

    Parameters
    ----------
    all_literature : dict
        Papers to classify; iterated in ``.values()`` order, and the 1-based
        position in that order is used as ``file_id``.
    example : int
        Number of few-shot examples, forwarded to ``QA_to_df``.
    provider, model : str
        Forwarded to ``QA_to_df``.

    Returns
    -------
    pandas.DataFrame
        Previously saved rows (if any) followed by the newly collected rows;
        an empty DataFrame when there is nothing to return.
    """
    save_path = f"results_s/{provider}_{example}.csv"
    os.makedirs("results_s", exist_ok=True)

    if os.path.exists(save_path):
        temporal_df = pd.read_csv(save_path)
        processed_ids = set(temporal_df["file_id"].astype(int).tolist())
        print(f"Start from paper {len(processed_ids)+1}")
    else:
        temporal_df = pd.DataFrame()
        processed_ids = set()

    all_results = [temporal_df] if not temporal_df.empty else []

    for idx, literature in enumerate(tqdm(all_literature.values(), desc="Progress", leave=False)):
        file_id = idx + 1
        if file_id in processed_ids:
            continue

        df = QA_to_df(literature, example, provider, model, idx)
        df["file_id"] = file_id
        all_results.append(df)

        temporal_df = pd.concat(all_results, ignore_index=True)
        # Write to a sibling temp file and swap it in, so an interrupted run
        # cannot leave a truncated checkpoint behind.
        tmp_path = save_path + ".tmp"
        temporal_df.astype(str).to_csv(tmp_path, index=False)
        os.replace(tmp_path, save_path)

    # pd.concat raises on an empty list (no papers and no saved rows).
    if not all_results:
        return pd.DataFrame()
    return pd.concat(all_results, ignore_index=True)

30 documents, 3 models, 3 prompt types, 5 temperatures, 10 runs: 13,500 API calls in total.

### Results

In [16]:
# Zero-shot prompt (example=0) with gpt-4.1-mini through the 'gpt' provider branch.
gpt41_0 = batch_extract_to_df(
    all_literature=literature,
    example=0,
    provider='gpt',
    model='gpt-4.1-mini',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:21<01:26, 21.67s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:36<00:53, 17.81s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [00:45<00:27, 13.72s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [00:53<00:11, 11.38s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [01:01<00:00, 12.38s/it]
Progress:   3%|██▍                                                                      | 1/30 [01:01<29:56, 61.94s/it]
[Ae 2:   0%|                           

In [20]:
# One-shot prompt (example=1) with gpt-4.1-mini through the 'gpt' provider branch.
gpt41_1 = batch_extract_to_df(
    all_literature=literature,
    example=1,
    provider='gpt',
    model='gpt-4.1-mini',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:11<00:47, 11.99s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:24<00:36, 12.04s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [00:34<00:23, 11.52s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [00:44<00:10, 10.91s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [00:56<00:00, 11.38s/it]
Progress:   3%|██▍                                                                      | 1/30 [00:56<27:30, 56.90s/it]
[Ae 2:   0%|                           

In [22]:
# Three-shot prompt (example=3) with gpt-4.1-mini through the 'gpt' provider branch.
gpt41_3 = batch_extract_to_df(
    all_literature=literature,
    example=3,
    provider='gpt',
    model='gpt-4.1-mini',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:21<01:26, 21.59s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:38<00:55, 18.54s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [00:58<00:39, 19.62s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [01:14<00:18, 18.13s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [01:29<00:00, 17.82s/it]
Progress:   3%|██▍                                                                      | 1/30 [01:29<43:05, 89.15s/it]
[Ae 2:   0%|                           

In [23]:
# Zero-shot prompt (example=0) with deepseek-chat through the 'ds' provider branch.
ds32_0 = batch_extract_to_df(
    all_literature=literature,
    example=0,
    provider='ds',
    model='deepseek-chat',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:17<01:10, 17.72s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:34<00:51, 17.13s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [00:51<00:34, 17.11s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [01:08<00:17, 17.01s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [01:25<00:00, 17.07s/it]
Progress:   3%|██▍                                                                      | 1/30 [01:25<41:15, 85.37s/it]
[Ae 2:   0%|                           

In [32]:
# One-shot prompt (example=1) with deepseek-chat through the 'ds' provider branch.
ds32_1 = batch_extract_to_df(
    all_literature=literature,
    example=1,
    provider='ds',
    model='deepseek-chat',
)

Start from paper 16


Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 16:   0%|                                                                                   | 0/5 [00:00<?, ?it/s]
[Ae 16:  20%|███████████████                                                            | 1/5 [00:18<01:14, 18.62s/it]
[Ae 16:  40%|██████████████████████████████                                             | 2/5 [00:36<00:55, 18.34s/it]
[Ae 16:  60%|█████████████████████████████████████████████                              | 3/5 [00:54<00:36, 18.27s/it]
[Ae 16:  80%|████████████████████████████████████████████████████████████               | 4/5 [01:12<00:17, 17.80s/it]
File 16: 100%|███████████████████████████████████████████████████████████████████████████| 5/5 [01:30<00:00, 18.06s/it]
Progress:  53%|██████████████████████████████████████▍                                 | 16/30 [01:30<01:19,  5.65s/it]
[Ae 17:   0%|                          

In [28]:
# Three-shot prompt (example=3) with deepseek-chat through the 'ds' provider branch.
ds32_3 = batch_extract_to_df(
    all_literature=literature,
    example=3,
    provider='ds',
    model='deepseek-chat',
)

Start from paper 27


Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 27:   0%|                                                                                   | 0/5 [00:00<?, ?it/s]
[Ae 27:  20%|███████████████                                                            | 1/5 [00:22<01:29, 22.46s/it]
[Ae 27:  40%|██████████████████████████████                                             | 2/5 [00:42<01:03, 21.10s/it]
[Ae 27:  60%|█████████████████████████████████████████████                              | 3/5 [01:04<00:42, 21.35s/it]
[Ae 27:  80%|████████████████████████████████████████████████████████████               | 4/5 [01:26<00:21, 21.50s/it]
File 27: 100%|███████████████████████████████████████████████████████████████████████████| 5/5 [01:46<00:00, 21.36s/it]
Progress:  90%|████████████████████████████████████████████████████████████████▊       | 27/30 [01:46<00:11,  3.96s/it]
[Ae 28:   0%|                          

In [29]:
# Zero-shot prompt (example=0) with gemini-2.5-flash through the 'gemini' provider branch.
gemini25_0 = batch_extract_to_df(
    all_literature=literature,
    example=0,
    provider='gemini',
    model='gemini-2.5-flash',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:13<00:55, 14.00s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:30<00:46, 15.58s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [00:43<00:28, 14.34s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [00:58<00:14, 14.73s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [01:11<00:00, 14.25s/it]
Progress:   3%|██▍                                                                      | 1/30 [01:11<34:27, 71.28s/it]
[Ae 2:   0%|                           

In [30]:
# One-shot prompt (example=1) with gemini-2.5-flash through the 'gemini' provider branch.
gemini25_1 = batch_extract_to_df(
    all_literature=literature,
    example=1,
    provider='gemini',
    model='gemini-2.5-flash',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:21<01:26, 21.54s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:35<00:51, 17.31s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [00:51<00:32, 16.49s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [01:09<00:16, 16.94s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [01:23<00:00, 16.72s/it]
Progress:   3%|██▍                                                                      | 1/30 [01:23<40:24, 83.61s/it]
[Ae 2:   0%|                           

In [45]:
# Three-shot prompt (example=3) with gemini-2.5-flash through the 'gemini' provider branch.
gemini25_3 = batch_extract_to_df(
    all_literature=literature,
    example=3,
    provider='gemini',
    model='gemini-2.5-flash',
)

Progress:   0%|                                                                                 | 0/30 [00:00<?, ?it/s]
[Ae 1:   0%|                                                                                    | 0/5 [00:00<?, ?it/s]
[Ae 1:  20%|███████████████▏                                                            | 1/5 [00:31<02:05, 31.36s/it]
[Ae 1:  40%|██████████████████████████████▍                                             | 2/5 [00:58<01:26, 28.95s/it]
[Ae 1:  60%|█████████████████████████████████████████████▌                              | 3/5 [01:32<01:02, 31.31s/it]
[Ae 1:  80%|████████████████████████████████████████████████████████████▊               | 4/5 [01:59<00:29, 29.54s/it]
File 1: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [02:27<00:00, 29.60s/it]
Progress:   3%|██▎                                                                   | 1/30 [02:28<1:11:33, 148.04s/it]
[Ae 2:   0%|                           