In [5]:
#!/usr/bin/env python
# -*-coding:utf-8 -*-
'''
@File    :   run_simulation_gss.py
@Time    :   2025/06/08 21:41:40
@Author  :   Shijian Liu
@Version :   1.0
@Contact :   lshijian405@gmail.com
@Desc    :   This script runs a simulation for survey responses on the GSS (General Social Survey) data using a predefined participant pool and survey context.
'''
from simulate_response import run_all_survey_responses_str
from llm_openai import openai_llm
import pandas as pd

# Response template: the prompt text later passed to the simulation as the
# survey prompt template. The printed template uses $age, $gender and $race
# placeholders.
# The encoding is given explicitly so the read does not depend on the
# machine's locale (open() otherwise falls back to the platform default).
with open("survey_response_template_gss.txt", "r", encoding="utf-8") as f:
    survey_template = f.read()
print(survey_template)

# Survey questions dataframe. Columns: Variable_Name, Average_Human_Response, Question
# (the displayed preview also shows a leading "Unnamed: 0" column loaded from the file).
df_gss = pd.read_csv("shijian_survey_data_1.csv")
df_gss.head()

You are a $age-year-old $gender identifying as $race. Please answer the following survey question. Please answer the questions by returning ONLY the corresponding number of your choice. Your answer should be purely a number.


Unnamed: 0,Variable_Name,Average_Human_Response,Question
0,polviews,4.086482,We hear a lot of talk these days about liberal...
1,natsoc,1.47733,We are faced with many problems in this countr...
2,natchld,1.496186,We are faced with many problems in this countr...
3,natsci,1.688086,We are faced with many problems in this countr...
4,equal4,2.345886,It is the responsibility of government to meet...


In [None]:
from tqdm import tqdm
import re

# For test only (uncomment to simulate just the first three questions):
# df_gss = df_gss.head(3)

# Leading integer of an answer such as "4" or "2- ABOUT RIGHT". Compiled once so
# each response is scanned a single time instead of twice.
LEADING_CHOICE_RE = re.compile(r'^\s*(\d+)')


def extract_choice_number(response):
    """Return the digits that start ``response`` as a string.

    Args:
        response: Raw value taken from the ``Response`` column.

    Returns:
        The leading run of digits (after optional whitespace) as a string, or
        ``None`` when ``response`` is not a string or does not start with a number.
    """
    if not isinstance(response, str):
        return None
    match = LEADING_CHOICE_RE.search(response)
    return match.group(1) if match else None


# Run the simulation for each survey question in the GSS dataset. Keep average of LLM responses.
average_llm_responses = []

for idx, row in tqdm(df_gss.iterrows(), total=len(df_gss), desc="Simulating survey responses"):
    responses_df = run_all_survey_responses_str(
        llm=openai_llm,
        participant_csv_path="participant_pool.csv",
        survey_prompt_template=survey_template,
        survey_str=row['Question'],
    )

    # Keep only the leading choice number of each answer, then coerce to numeric;
    # anything without a leading number becomes NaN and is skipped by mean().
    responses_df['choice_number'] = responses_df['Response'].apply(extract_choice_number)
    responses_numeric = pd.to_numeric(responses_df['choice_number'], errors='coerce')

    # Surface dropped answers instead of silently averaging over fewer values
    # (or producing a NaN average). tqdm.write keeps the progress bar intact.
    n_unparsed = int(responses_numeric.isna().sum())
    if n_unparsed or responses_numeric.empty:
        tqdm.write(
            f"Warning: {n_unparsed} of {len(responses_numeric)} responses for "
            f"{row.get('Variable_Name', idx)!r} had no leading number and were "
            "excluded from the average."
        )

    average_llm_responses.append(responses_numeric.mean())
    # Optionally, save each responses_df if you want:
    # responses_df.to_csv(f"simulated_responses_{row['Variable_Name']}.csv", index=False)

# One average per question, in the same row order as df_gss.
df_gss['Average_LLM_Response'] = average_llm_responses
df_gss.to_csv("gss_with_llm_responses.csv", index=False)
print("Saved GSS with LLM average responses to gss_with_llm_responses.csv")


100%|██████████| 6/6 [00:07<00:00,  1.22s/it] 0/3 [00:00<?, ?it/s]
Simulating survey responses:  33%|███▎      | 1/3 [00:07<00:14,  7.34s/it]

                          ParticipantID  Age  Gender        Race  \
0  f9e9d992-9c8b-47b2-832a-ba8edb55d810   62  Female  Indigenous   
1  b7163744-b2b5-4a45-b333-88c6cd63146c   24    Male       White   
2  ee64c9a6-f560-41f8-b823-275cf3131668   53  Female       Asian   
3  c4c897b3-7658-4103-b88b-34425ccb8521   28  Female       Asian   
4  21b3f469-22a2-445e-bdb6-fc576f3e9232   31    Male       Asian   

                         Response  
0  4-Moderate, middle of the road  
1  4-Moderate, middle of the road  
2  4-Moderate, middle of the road  
3  4-Moderate, middle of the road  
4                               4  


100%|██████████| 6/6 [00:02<00:00,  2.93it/s]
Simulating survey responses:  67%|██████▋   | 2/3 [00:09<00:04,  4.23s/it]

                          ParticipantID  Age  Gender        Race  \
0  f9e9d992-9c8b-47b2-832a-ba8edb55d810   62  Female  Indigenous   
1  b7163744-b2b5-4a45-b333-88c6cd63146c   24    Male       White   
2  ee64c9a6-f560-41f8-b823-275cf3131668   53  Female       Asian   
3  c4c897b3-7658-4103-b88b-34425ccb8521   28  Female       Asian   
4  21b3f469-22a2-445e-bdb6-fc576f3e9232   31    Male       Asian   

         Response  
0  2- ABOUT RIGHT  
1  2- ABOUT RIGHT  
2  2- ABOUT RIGHT  
3  2- ABOUT RIGHT  
4               2  


100%|██████████| 6/6 [00:02<00:00,  2.89it/s]
Simulating survey responses: 100%|██████████| 3/3 [00:11<00:00,  3.83s/it]

                          ParticipantID  Age  Gender        Race  \
0  f9e9d992-9c8b-47b2-832a-ba8edb55d810   62  Female  Indigenous   
1  b7163744-b2b5-4a45-b333-88c6cd63146c   24    Male       White   
2  ee64c9a6-f560-41f8-b823-275cf3131668   53  Female       Asian   
3  c4c897b3-7658-4103-b88b-34425ccb8521   28  Female       Asian   
4  21b3f469-22a2-445e-bdb6-fc576f3e9232   31    Male       Asian   

         Response  
0  2- ABOUT RIGHT  
1     3- TOO MUCH  
2   1- TOO LITTLE  
3   1- TOO LITTLE  
4  2- ABOUT RIGHT  
Saved GSS with LLM average responses to gss_with_llm_responses.csv



