## LLM Long-/Short- Distance Conceptualization Part I

This experiment tests how LLMs conceptualize "long" versus "short" distance across different scenarios.

### Libraries

In [65]:
from openai import OpenAI, RateLimitError, APIError, APITimeoutError, AuthenticationError, BadRequestError, NotFoundError
from google.api_core.exceptions import ResourceExhausted, RetryError, DeadlineExceeded
import google.generativeai as genai

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import time
import os
import ast
from tqdm import tqdm
import requests

### Layout

In [3]:
# Remove every pandas display limit so DataFrames are printed in full.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
    'display.max_seq_items',
):
    pd.set_option(option_name, None)

### Pre-defined variables

Settings fixed in advance for every run, such as the maximum number of output tokens, temperature, top-p, logprobs, etc.

In [18]:
# Output-token cap. In get_completion only the 'gpt' branch passes this value;
# the 'gemini' branch hard-codes 100 and the 'ds' branch sets no cap.
MAX_TOKEN = 16
# Sampling temperature passed to every provider call in get_completion.
TEMPERATURE = 0.7

### General commands for getting response

We use three models (gpt-5.1, gemini-2.5-flash, and deepseek-3.2-chat), each accessed through its provider's API. All models use the same temperature (0.7), and each experiment is repeated 30 times to obtain a reliable estimate of the threshold.

In [62]:
def api_def(provider):
    """Set up API access for ``provider`` using the keys stored in ``api.txt``.

    The key file is read on every call, one key per line. Zero-based line
    indices used: 0 for 'chatgpt', 1 for 'deepseek', 4 for 'gemini'.

    Args:
        provider: 'chatgpt', 'deepseek' or 'gemini'.

    Returns:
        An ``OpenAI`` client for 'chatgpt' and 'deepseek'. For 'gemini' the
        ``genai`` module is configured in place and ``None`` is returned.
        Any other value also yields ``None``.
    """
    # API keys are kept out of the notebook in a separate file.
    with open('api.txt', 'r', encoding='utf-8') as key_file:
        keys = [raw_line.strip() for raw_line in key_file.readlines()]

    # Each provider needs its own kind of client setup.
    if provider == 'chatgpt':
        return OpenAI(api_key=str(keys[0]), base_url="https://api.openai.com/v1")

    if provider == 'deepseek':
        return OpenAI(api_key=str(keys[1]), base_url="https://api.deepseek.com")

    if provider == 'gemini':
        genai.configure(api_key=str(keys[4]))

    return None

In [20]:
def get_completion(prompt, provider, model):
    """Send ``prompt`` to the chosen provider and return the reply text.

    Args:
        prompt: User question to ask.
        provider: 'gpt', 'ds' or 'gemini'.
        model: Model identifier forwarded to the provider's API.

    Returns:
        The text of the model's reply, or ``None`` when ``provider`` is not
        one of the three supported values.

    Notes:
        The system instruction is sent to the 'gpt' and 'ds' branches only;
        the 'gemini' branch sends the bare prompt. Token caps also differ:
        'gpt' uses ``MAX_TOKEN``, 'gemini' uses a fixed 100, 'ds' sets none.
    """
    system_text = "You are participating in our social survey. Please simply answer the question without deep thinking"
    messages = [
        {"role": "system", "content": system_text},
        {"role": "user", "content": prompt},
    ]

    # Each provider exposes a different request/response shape.
    if provider == 'gpt':
        reply = api_def('chatgpt').responses.create(
            model=model,
            input=messages,
            temperature=TEMPERATURE,
            max_output_tokens=MAX_TOKEN,
        )
        return reply.output_text

    if provider == 'ds':
        reply = api_def('deepseek').chat.completions.create(
            model=model,
            messages=messages,
            temperature=TEMPERATURE,
        )
        return reply.choices[0].message.content

    if provider == 'gemini':
        # api_def configures the genai module as a side effect; nothing is returned.
        api_def('gemini')
        reply = genai.GenerativeModel(model).generate_content(
            prompt,
            generation_config={"temperature": TEMPERATURE, "max_output_tokens": 100},
        )
        return reply.text

    return None

In [21]:
def QA_1(provider, model, dist):
    """Scenario 1: ask whether ``dist`` kilometers is a long ('Y') or short ('N') distance.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if {dist} kilometers qualifies as a "long distance" or a 'short distance'. \
    Output only one character: If it qualifies as a 'long distance', please return 'Y', while if it qualifies as \
    a 'short distance', return 'N'. We don't have any other category in this scheme like "medium distance". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments. 
    """
    return get_completion(question, provider, model)

In [22]:
def QA_2(provider, model, dist):
    """Scenario 2: ask whether a ``dist``-kilometer commute is long ('Y') or short ('N').

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer commute qualifies as a 'long-distance commute' or a 'short-distance commute'. \
    Output only one character: If it qualifies as a 'long-distance commute', please return 'Y', while if it \
    qualifies as a 'short-distance commute', please return 'N'. We don't have any other category in this scheme like "medium-distance commute". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [23]:
def QA_3(provider, model, dist):
    """Scenario 3: ask whether a ``dist``-kilometer human migration is long ('Y') or short ('N').

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer human migration qualifies as a 'long-distance human migration' \
    or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [24]:
def QA_4(provider, model, dist):
    """Scenario 4: long ('Y') or short ('N') for a ``dist``-kilometer migration under environmental or climate change.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer human migration under the influence of environmental or climate change \
    qualifies as a 'long-distance human migration' or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [25]:
def QA_5(provider, model, dist):
    """Scenario 5: long ('Y') or short ('N') for a ``dist``-kilometer seasonal migration under environmental or climate change.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer seasonal human migration under the influence of environmental or \
    climate change qualifies as a 'long-distance human migration' or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [26]:
def QA_6(provider, model, dist):
    """Scenario 6: long ('Y') or short ('N') for a ``dist``-kilometer "temporal" migration under environmental or climate change.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    # NOTE(review): the prompt says "temporal"; "temporary" may have been
    # intended (QA_7 asks about "permanent" migration) - confirm before re-running.
    question = f"""
    Please determine if a {dist}-kilometer temporal human migration under the influence of environmental or \
    climate change qualifies as a 'long-distance human migration' or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [27]:
def QA_7(provider, model, dist):
    """Scenario 7: long ('Y') or short ('N') for a ``dist``-kilometer permanent migration under environmental or climate change.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer permanent human migration under the influence of environmental or \
    climate change qualifies as a 'long-distance human migration' or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [28]:
def QA_8(provider, model, dist):
    """Scenario 8: long ('Y') or short ('N') for a ``dist``-kilometer internal migration under environmental or climate change.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer internal human migration under the influence of environmental or \
    climate change qualifies as a 'long-distance human migration' or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [29]:
def QA_9(provider, model, dist):
    """Scenario 9: long ('Y') or short ('N') for a ``dist``-kilometer international migration under environmental or climate change.

    Returns the model's raw reply as produced by ``get_completion``.
    """
    question = f"""
    Please determine if a {dist}-kilometer international human migration under the influence of environmental or \
    climate change qualifies as a 'long-distance human migration' or a 'short-distance human migration'. \
    Output only one character: If it qualifies as a 'long-distance human migration', please return 'Y', \
    while if it qualifies as a 'short-distance human migration', please return 'N'. \
    We don't have any other category in this scheme like "medium-distance human migration". \n
    Do not infer any missing information. Output only the single character without any other thing such as \
    explanations, opinions or meta-comments.
    """
    return get_completion(question, provider, model)

In [73]:
def QA_to_df(provider, model, min_d, max_d, step, QA_number):
    """Run one scenario over a grid of distances and save the answers as CSV.

    Every distance in ``np.arange(min_d, max_d, step)`` (``max_d`` excluded)
    is asked 30 times through the module-level function ``QA_<QA_number>``.
    Failed requests are retried after a one-second pause.

    Args:
        provider: Provider key forwarded to the QA function; also used in
            the output file name.
        model: Model identifier forwarded to the QA function and stored in
            the ``model`` column.
        min_d: First distance of the grid.
        max_d: Exclusive upper bound of the grid.
        step: Grid spacing.
        QA_number: Scenario number selecting ``QA_<QA_number>``.

    Returns:
        DataFrame with columns ``answer``, ``distance``, ``model``, ``run``.
        The same frame is written to ``result/<provider>_Q<QA_number>.csv``.

    Raises:
        KeyError: If no ``QA_<QA_number>`` function exists.
        ValueError: Once 100 failed requests have accumulated over the whole
            call. Nothing is written to disk in that case.
    """
    # Resolve the prompt function once instead of on every request.
    QA = globals()[f"QA_{QA_number}"]
    results = []
    fail_count = 0  # retry budget shared by the entire sweep, not per request

    for d in tqdm(np.arange(min_d, max_d, step)):
        for run_id in range(30):
            while True:
                try:
                    answer = QA(provider, model, d)
                except (RateLimitError, APIError, APITimeoutError, ResourceExhausted, ValueError) as e:
                    fail_count += 1
                    if fail_count >= 100:
                        # Report the request that failed. The previous message
                        # interpolated `result`, which is unbound when the very
                        # first request fails and stale otherwise.
                        raise ValueError(
                            f"Can't get response for distance={d}, run={run_id}, "
                            f"model={model}: retried too many times"
                        ) from e
                    time.sleep(1)
                else:
                    results.append({"answer": answer, "distance": d, "model": model, "run": run_id})
                    break

    df = pd.DataFrame(results)
    # Create the output folder if needed so a long run is not lost at save time.
    os.makedirs("result", exist_ok=True)
    df.to_csv(f"result/{provider}_Q{QA_number}.csv", index=False)

    return df

In total: 3 models × 9 scenarios × 1 temperature × 25 distances × 30 runs. This requires 20,250 API calls and yields a final result with 20,250 rows.

In [43]:
# gpt-5.1, scenario 1 (plain distance): 2-50 km in 2 km steps, 30 runs each.
gpt51_1 = QA_to_df('gpt', 'gpt-5.1', 2, 52, 2, 1)

100%|██████████| 25/25 [16:24<00:00, 39.37s/it]


In [44]:
# gpt-5.1, scenario 2 (commute): 2-50 km in 2 km steps, 30 runs each.
gpt51_2 = QA_to_df('gpt', 'gpt-5.1', 2, 52, 2, 2)

100%|██████████| 25/25 [16:56<00:00, 40.64s/it]


In [45]:
# gpt-5.1, scenario 3 (human migration): 20-500 km in 20 km steps, 30 runs each.
gpt51_3 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 3)

100%|██████████| 25/25 [16:20<00:00, 39.22s/it]


In [46]:
# gpt-5.1, scenario 4 (migration under environmental/climate change): 20-500 km in 20 km steps.
gpt51_4 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 4)

100%|██████████| 25/25 [16:57<00:00, 40.71s/it]


In [47]:
# gpt-5.1, scenario 5 (seasonal migration): 20-500 km in 20 km steps.
gpt51_5 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 5)

100%|██████████| 25/25 [17:00<00:00, 40.84s/it]


In [48]:
# gpt-5.1, scenario 6 ("temporal" migration, as worded in QA_6): 20-500 km in 20 km steps.
gpt51_6 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 6)

100%|██████████| 25/25 [14:39<00:00, 35.17s/it]


In [49]:
# gpt-5.1, scenario 7 (permanent migration): 20-500 km in 20 km steps.
gpt51_7 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 7)

100%|██████████| 25/25 [16:19<00:00, 39.18s/it]


In [50]:
# gpt-5.1, scenario 8 (internal migration): 20-500 km in 20 km steps.
gpt51_8 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 8)

100%|██████████| 25/25 [16:42<00:00, 40.09s/it]


In [51]:
# gpt-5.1, scenario 9 (international migration): 20-500 km in 20 km steps.
gpt51_9 = QA_to_df('gpt', 'gpt-5.1', 20, 520, 20, 9)

100%|██████████| 25/25 [16:45<00:00, 40.23s/it]


In [52]:
# deepseek-chat, scenario 1 (plain distance): 2-50 km in 2 km steps, 30 runs each.
ds32_1 = QA_to_df('ds', 'deepseek-chat', 2, 52, 2, 1)

100%|██████████| 25/25 [16:09<00:00, 38.80s/it]


In [53]:
# deepseek-chat, scenario 2 (commute): 2-50 km in 2 km steps, 30 runs each.
ds32_2 = QA_to_df('ds', 'deepseek-chat', 2, 52, 2, 2)

100%|██████████| 25/25 [16:08<00:00, 38.73s/it]


In [54]:
# deepseek-chat, scenario 3 (human migration): 20-500 km in 20 km steps, 30 runs each.
ds32_3 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 3)

100%|██████████| 25/25 [16:43<00:00, 40.16s/it]


In [55]:
# deepseek-chat, scenario 4 (migration under environmental/climate change): 20-500 km in 20 km steps.
ds32_4 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 4)

100%|██████████| 25/25 [16:06<00:00, 38.67s/it]


In [56]:
# deepseek-chat, scenario 5 (seasonal migration): 20-500 km in 20 km steps.
ds32_5 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 5)

100%|██████████| 25/25 [16:40<00:00, 40.02s/it]


In [57]:
# deepseek-chat, scenario 6 ("temporal" migration, as worded in QA_6): 20-500 km in 20 km steps.
ds32_6 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 6)

100%|██████████| 25/25 [15:46<00:00, 37.87s/it]


In [58]:
# deepseek-chat, scenario 7 (permanent migration): 20-500 km in 20 km steps.
ds32_7 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 7)

100%|██████████| 25/25 [16:33<00:00, 39.75s/it]


In [59]:
# deepseek-chat, scenario 8 (internal migration): 20-500 km in 20 km steps.
ds32_8 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 8)

100%|██████████| 25/25 [16:17<00:00, 39.12s/it]


In [60]:
# deepseek-chat, scenario 9 (international migration): 20-500 km in 20 km steps.
ds32_9 = QA_to_df('ds', 'deepseek-chat', 20, 520, 20, 9)

100%|██████████| 25/25 [16:22<00:00, 39.31s/it]


In [74]:
# gemini-2.5-flash, scenario 1 (plain distance): 2-50 km in 2 km steps, 30 runs each.
gemini_25_1 = QA_to_df('gemini', 'gemini-2.5-flash', 2, 52, 2, 1)

100%|██████████| 25/25 [17:04<00:00, 40.99s/it]


In [75]:
# gemini-2.5-flash, scenario 2 (commute): 2-50 km in 2 km steps, 30 runs each.
gemini_25_2 = QA_to_df('gemini', 'gemini-2.5-flash', 2, 52, 2, 2)

100%|██████████| 25/25 [16:44<00:00, 40.19s/it]


In [76]:
# gemini-2.5-flash, scenario 3 (human migration): 20-500 km in 20 km steps, 30 runs each.
gemini_25_3 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 3)

100%|██████████| 25/25 [14:36<00:00, 35.08s/it]


In [77]:
# gemini-2.5-flash, scenario 4 (migration under environmental/climate change): 20-500 km in 20 km steps.
gemini_25_4 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 4)

100%|██████████| 25/25 [14:15<00:00, 34.22s/it]


In [78]:
# gemini-2.5-flash, scenario 5 (seasonal migration): 20-500 km in 20 km steps.
gemini_25_5 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 5)

100%|██████████| 25/25 [14:05<00:00, 33.82s/it]


In [79]:
# gemini-2.5-flash, scenario 6 ("temporal" migration, as worded in QA_6): 20-500 km in 20 km steps.
gemini_25_6 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 6)

100%|██████████| 25/25 [14:50<00:00, 35.63s/it]


In [80]:
# gemini-2.5-flash, scenario 7 (permanent migration): 20-500 km in 20 km steps.
gemini_25_7 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 7)

100%|██████████| 25/25 [14:18<00:00, 34.36s/it]


In [81]:
# gemini-2.5-flash, scenario 8 (internal migration): 20-500 km in 20 km steps.
gemini_25_8 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 8)

100%|██████████| 25/25 [15:23<00:00, 36.93s/it]


In [82]:
# gemini-2.5-flash, scenario 9 (international migration): 20-500 km in 20 km steps.
gemini_25_9 = QA_to_df('gemini', 'gemini-2.5-flash', 20, 520, 20, 9)

100%|██████████| 25/25 [14:42<00:00, 35.31s/it]


### Analysis

In [90]:
# Load every CSV in result/ into a dict keyed by file stem, tag each frame
# with its scenario label (last two characters of the stem, e.g. 'Q3'),
# then stack all frames into one table.
df_dict = {}

for file_name in os.listdir('result/'):
    if not file_name.endswith('.csv'):
        continue

    df_name = file_name.replace('.csv', '')
    file_path = os.path.join('result/', file_name)

    df = pd.read_csv(file_path)
    df['Q'] = df_name[-2:]
    df_dict[df_name] = df

all_result = pd.concat(df_dict.values(), axis=0, ignore_index=True)

In [92]:
# Encode answers numerically: 'N' -> 0, 'Y' -> 1, anything else -> 0.5.
mapping_dict = {'N': 0, 'Y': 1}


def _encode_answer(answer):
    """Return 0 for 'N', 1 for 'Y' and 0.5 for missing or unrecognised replies.

    Surrounding whitespace is ignored, so a reply such as 'Y\\n' still counts
    as 'Y' instead of falling into the 0.5 bucket.
    """
    if not isinstance(answer, str):
        # Missing values read back from CSV are NaN, not strings.
        return 0.5
    return mapping_dict.get(answer.strip(), 0.5)


all_result['answer_trans'] = all_result['answer'].map(_encode_answer)
all_result = all_result.drop(columns=['answer', 'run']).reset_index(drop=True)
# The mean over the 30 runs is the share of 'Y' (long-distance) answers.
grouped_result = all_result.groupby(['distance', 'model', 'Q']).agg({'answer_trans': 'mean'}).reset_index()
grouped_result

Unnamed: 0,distance,model,Q,answer_trans
0,2,deepseek-chat,Q1,0.0
1,2,deepseek-chat,Q2,0.0
2,2,gemini-2.5-flash,Q1,0.0
3,2,gemini-2.5-flash,Q2,0.0
4,2,gpt-5.1,Q1,0.0
5,2,gpt-5.1,Q2,0.0
6,4,deepseek-chat,Q1,0.033333
7,4,deepseek-chat,Q2,0.0
8,4,gemini-2.5-flash,Q1,0.0
9,4,gemini-2.5-flash,Q2,0.0


We plot how the probability that an LLM labels a distance as 'long distance' changes with the distance, grouped both by model and by question.

In [123]:
def plot(df, model, pos):
    """Save a chart of 'long-distance' probability vs. distance for one model.

    One line is drawn per value of the ``Q`` column. The figure is written
    to ``<pos>/<model>.png``.

    Args:
        df: Frame with columns ``distance``, ``model``, ``Q``, ``answer_trans``.
        model: Value of the ``model`` column to plot.
        pos: Output directory; created if it does not exist.
    """
    subset = df[df['model'] == model]

    plt.figure(figsize=(12, 2))
    try:
        plt.title(f"{model}", fontsize=12, pad=10)

        Qs = sorted(subset['Q'].unique())
        colors = plt.cm.Set1(np.linspace(0, 1, len(Qs)))

        for color, Q in zip(colors, Qs):
            Q_data = subset[subset['Q'] == Q].sort_values('distance')
            plt.plot(Q_data['distance'], Q_data['answer_trans'], marker='', linewidth=1.5,
                     color=color, label=f'{Q}', antialiased=True)

        plt.xlabel('Distance', fontsize=10)
        plt.ylabel('\'Long-distance\' probability', fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.ylim(0, 1)

        if not subset.empty:
            # Derive x ticks from the data. The fixed arange(0, 500, 100)
            # stopped at 400 because arange excludes its endpoint.
            x_max = subset['distance'].max()
            tick_step = 100 if x_max > 100 else 10
            plt.xticks(np.arange(0, x_max + tick_step, tick_step), fontsize=7)
        # linspace includes the endpoint, so the 1.0 tick is shown.
        plt.yticks(np.linspace(0, 1, 6), fontsize=7)

        plt.legend(title='Question number', title_fontsize=8, fontsize=7, loc='best', framealpha=0.9)

        os.makedirs(pos, exist_ok=True)
        filepath = os.path.join(pos, f"{model}.png")
        plt.savefig(filepath, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close()

In [124]:
# One figure per model, with all scenarios overlaid, saved under plots/sce.
model_names = grouped_result['model'].unique()
for model in model_names:
    plot(grouped_result, model, 'plots/sce')

In [125]:
def plot2(df, Q, pos):
    """Save a chart of 'long-distance' probability vs. distance for one scenario.

    One line is drawn per value of the ``model`` column. The figure is
    written to ``<pos>/<Q>.png``.

    Args:
        df: Frame with columns ``distance``, ``model``, ``Q``, ``answer_trans``.
        Q: Value of the ``Q`` column to plot.
        pos: Output directory; created if it does not exist.
    """
    subset = df[df['Q'] == Q]

    plt.figure(figsize=(12, 2))
    try:
        plt.title(f"{Q}", fontsize=12, pad=10)

        models = sorted(subset['model'].unique())
        colors = plt.cm.Set1(np.linspace(0, 1, len(models)))

        for color, model in zip(colors, models):
            model_data = subset[subset['model'] == model].sort_values('distance')
            plt.plot(model_data['distance'], model_data['answer_trans'], marker='', linewidth=1.5,
                     color=color, label=f'{model}', antialiased=True)

        plt.xlabel('Distance', fontsize=10)
        plt.ylabel('\'Long-distance\' probability', fontsize=10)
        plt.grid(True, alpha=0.3)
        plt.ylim(0, 1)

        if not subset.empty:
            # Derive x ticks from the data. Fixed 0-400 ticks stretched the
            # axis far beyond scenarios whose distances only reach 50, and
            # omitted the 500 tick for the others.
            x_max = subset['distance'].max()
            tick_step = 100 if x_max > 100 else 10
            plt.xticks(np.arange(0, x_max + tick_step, tick_step), fontsize=7)
        # linspace includes the endpoint, so the 1.0 tick is shown.
        plt.yticks(np.linspace(0, 1, 6), fontsize=7)

        plt.legend(title='Model', title_fontsize=8, fontsize=7, loc='best', framealpha=0.9)

        os.makedirs(pos, exist_ok=True)
        filepath = os.path.join(pos, f"{Q}.png")
        plt.savefig(filepath, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even when saving fails.
        plt.close()

In [126]:
# One figure per scenario, with all models overlaid, saved under plots/sce.
question_labels = grouped_result['Q'].unique()
for Q in question_labels:
    plot2(grouped_result, Q, 'plots/sce')