# Get Action Plans of leaders with demographics

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the generated responses of a single LLM (chosen via `model`).
model = 'mistral'
df = pd.read_csv(f'/projects/humansVsLLMs/results/{model}_generated_responses.csv')
df.shape  # (rows, columns) of the loaded file

In [None]:
# Display the generated response stored at row position 4.
df['Response'].iloc[4]

In [None]:
# Demographics table read from an Excel file; it is joined on 'PID' further below.
real_life_leaders_demographics_df = pd.read_excel('/projects/humansVsLLMs/data/Inclusion Chatbot (LeaderDemographics)_February 20, 2025_06.43.xlsx')

In [None]:
# Full goals dataset; it is filtered by uniqueId in the next cell.
all_data = pd.read_excel('/projects/humansVsLLMs/data/Oct22_Jap_goals.xlsx')
all_data.shape  # (rows, columns) before filtering

(1693, 13)

In [None]:
# NOTE(review): `list_of_ids` is not defined in any visible cell, so it must
# already exist in the kernel. Presumably it holds the uniqueIds of the leaders
# that have demographic data — confirm before re-running from a clean kernel.
real_life_leaders_df = all_data[all_data['uniqueId'].isin(list_of_ids)] # filter participants with socio-demographic data present
real_life_leaders_df.shape  # (rows, columns) after filtering

(1154, 13)

In [None]:
# Persist the filtered leaders (index not written); a later cell reloads this file.
real_life_leaders_df.to_csv('/projects/humansVsLLMs/data/data_leaders_with_demographics.csv', index=False)

In [116]:
# Goal columns that are pooled into the list of leader action plans.
selected_columns = [
    'firstTaskGoal',
    'addTaskGoals',
    'addFirstRelGoal',
    'addRelGoals',
    'addRelGoalsLater',
]

In [None]:
# Keep only the goal columns and display the last five rows.
df_selected = real_life_leaders_df[selected_columns]
df_selected.tail(5)

In [None]:
# Remove NaN and numeric-only values in each cell
def clean_cell(cell):
    """Normalise one goal cell to stripped text, or NaN when it holds no text.

    Args:
        cell: Raw cell value (a string, a number, or a missing value).

    Returns:
        The stripped string, or ``np.nan`` when the cell is missing, blank,
        or purely numeric (optionally signed, with at most one decimal point).
    """
    if pd.isna(cell):
        return np.nan
    # np.number also covers NumPy integer scalars, which do not subclass `int`.
    if isinstance(cell, (int, float, np.number)):
        return np.nan
    text = str(cell).strip()
    if not text:
        # A whitespace-only answer contains no action plan.
        return np.nan
    # Ignore one leading sign so that "-5" or "+3.2" count as numeric as well.
    unsigned = text[1:] if text[0] in '+-' else text
    if unsigned.replace('.', '', 1).isdigit():
        return np.nan
    return text

# Clean every cell. DataFrame.applymap is deprecated since pandas 2.1 in favour
# of DataFrame.map, so use `map` whenever the installed pandas provides it.
_elementwise = df_selected.map if hasattr(df_selected, 'map') else df_selected.applymap
df_cleaned = _elementwise(clean_cell)

# Flatten the selected columns, row by row, into a single Series of goal texts.
# NaN is dropped *before* renumbering so the final index is gap-free regardless
# of whether this pandas version's stack() already discards NaN itself.
final_series = df_cleaned.stack().dropna().reset_index(drop=True)


In [None]:
# Preview the pooled goals as a one-column DataFrame.
final_series.to_frame(name='Leader_Action_Plans')

In [None]:
# Save to CSV: a single 'Leader_Action_Plans' column, index not written.
final_series.to_frame(name='Leader_Action_Plans').to_csv("/projects/humansVsLLMs/data/goals_leader_with_demographics.csv", index=False)

# Data Prep for Semantic Analysis

In [None]:
# Reload the filtered leaders from disk.
# Note: this rebinds `df`, which earlier in the notebook held LLM responses.
df = pd.read_csv('/projects/humansVsLLMs/data/data_leaders_with_demographics.csv')
df.head()

In [122]:
# Label every row with the inclusion dimensions its goals belong to:
#   stageName == 'Goal-Setting'               -> 'Uniqueness'    / 'Appreciation'
#   stageName == 'Goal-Setting-Belongingness' -> 'Belongingness' / 'OrgEfforts'
# Dimension 1 is set only when firstTaskGoal is present, dimension 2 only when
# addFirstRelGoal is present; every other row keeps the default None.
df['Inclusion_Dimension_1'] = None
df['Inclusion_Dimension_2'] = None

# Each row is labelled independently of all others, so boolean masks replace
# the former per-uniqueId filter + iterrows() loop, which rescanned the whole
# frame once per participant.
has_id = df['uniqueId'].notna()  # the old loop never visited rows without a uniqueId
in_uniqueness_stage = has_id & (df['stageName'] == 'Goal-Setting')
in_belonging_stage = has_id & (df['stageName'] == 'Goal-Setting-Belongingness')
has_task_goal = df['firstTaskGoal'].notna()
has_rel_goal = df['addFirstRelGoal'].notna()

# Logic for Inclusion_Dimension_1
df.loc[in_uniqueness_stage & has_task_goal, 'Inclusion_Dimension_1'] = 'Uniqueness'
df.loc[in_belonging_stage & has_task_goal, 'Inclusion_Dimension_1'] = 'Belongingness'

# Logic for Inclusion_Dimension_2
df.loc[in_uniqueness_stage & has_rel_goal, 'Inclusion_Dimension_2'] = 'Appreciation'
df.loc[in_belonging_stage & has_rel_goal, 'Inclusion_Dimension_2'] = 'OrgEfforts'


In [None]:
# Inspect the first 20 rows, including the two Inclusion_Dimension columns.
df.head(20)

In [138]:
# Put both sides of the lookup on plain ints so the ids compare equal.
# NOTE(review): `list_of_ids` is not defined in the visible cells; it must
# already exist in the kernel.
ordered_ids = list_of_ids
df['uniqueId'] = df['uniqueId'].astype(int)
ordered_ids = list(map(int, ordered_ids))

# Position of every id inside the reference ordering (a repeated id keeps
# its last position).
id_order = dict(zip(ordered_ids, range(len(ordered_ids))))

# Rank each row by the position of its uniqueId; ids missing from the
# reference ordering get NaN here.
df['sort_key'] = df['uniqueId'].map(id_order)

# Discard rows whose uniqueId is not part of the reference ordering.
df = df.dropna(subset=['sort_key'])

# Order rows by rank and renumber the index from zero.
df_sorted_new = df.sort_values('sort_key').reset_index(drop=True)


In [125]:
# Sort in ascending order of sort_key and renumber the index from zero.
# This repeats the last step of the previous cell; the index reset is applied
# here too so that df_sorted_new ends up identical whichever cell ran last.
df_sorted_new = df.sort_values(by='sort_key', ascending=True).reset_index(drop=True)

In [None]:
# Lookup table: position in `p_ids` -> value stored at that position.
# NOTE(review): `p_ids` is not defined in the visible cells; presumably it is
# aligned one-to-one with the id ordering that produced sort_key — confirm.
index_to_value = dict(enumerate(p_ids))

# Cast sort_key to int before it is used as the lookup key.
df_sorted_new['sort_key'] = df_sorted_new['sort_key'].astype(int)

# Translate each row's position into its PID.
df_sorted_new['PID'] = df_sorted_new['sort_key'].map(index_to_value)

In [None]:
# Left-join the demographics onto the ordered leader rows via 'PID'; with
# how='left' every row of df_sorted_new is kept even without a match.
final_df = pd.merge(df_sorted_new, real_life_leaders_demographics_df, how='left', on=['PID'])
final_df.shape  # (rows, columns) after the join

In [None]:
# Persist the merged leader data (index not written); a later cell reloads this file.
final_df.to_csv('/projects/humansVsLLMs/data/data_leaders_with_demographics_semantics.csv', index=False)

# Data Prep for Human Evaluation

In [None]:
# Build the human-evaluation sample: for every model keep only the responses
# shorter than 1500 characters, tag them with the model name, and write the
# combined table to CSV.
file_path = '/projects/humansVsLLMs/results/3-shot-generated-responses'
models = ['cohere', 'deepseek', 'gemini', 'gpt-4o-mini', 'llama', 'mistral', 'qwen']
filtered_frames = []
for model in models:
    df = pd.read_csv(f'{file_path}/{model}_generated_responses.csv')
    # Length is taken on str(value), so a missing Response is measured as 'nan'.
    df['Length'] = df['Response'].map(lambda a: len(str(a)))
    df['model_name'] = model
    new_df = df[df['Length'] < 1500]
    filtered_frames.append(new_df)
    print(f'Shape of original {model}_df: {df.shape} and the Shape of new df: {new_df.shape}')
# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# everything collected so far on each iteration, and seeding it with an empty
# DataFrame relies on concat behaviour that recent pandas versions deprecate.
final_df = pd.concat(filtered_frames, ignore_index=True)
print(f'Shape of Final df: {final_df.shape}')
final_df.to_csv('/projects/humansVsLLMs/data/human_evaluators_sample.csv', index=False)

In [None]:
# Load the responses of every model, unfiltered, into one table tagged with
# the model name and the response length.
file_path = '/projects/humansVsLLMs/results/3-shot-generated-responses'
models = ['cohere', 'deepseek', 'gemini', 'gpt-4o-mini', 'llama', 'mistral', 'qwen']
model_frames = []
for model in models:
    df = pd.read_csv(f'{file_path}/{model}_generated_responses.csv')
    # Length is taken on str(value), so a missing Response is measured as 'nan'.
    df['Length'] = df['Response'].map(lambda a: len(str(a)))
    df['model_name'] = model
    model_frames.append(df)
    print(f'Shape of original {model}_df: {df.shape}')
# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# everything collected so far on each iteration, and seeding it with an empty
# DataFrame relies on concat behaviour that recent pandas versions deprecate.
df_llm_texts = pd.concat(model_frames, ignore_index=True)
print(f'Shape of Final df: {df_llm_texts.shape}')

In [None]:
# Same per-model filtering as the cell that writes human_evaluators_sample.csv
# (responses shorter than 1500 characters), but the result stays in memory.
file_path = '/projects/humansVsLLMs/results/3-shot-generated-responses'
models = ['cohere', 'deepseek', 'gemini', 'gpt-4o-mini', 'llama', 'mistral', 'qwen']
filtered_frames = []
for model in models:
    df = pd.read_csv(f'{file_path}/{model}_generated_responses.csv')
    # Length is taken on str(value), so a missing Response is measured as 'nan'.
    df['Length'] = df['Response'].map(lambda a: len(str(a)))
    df['model_name'] = model
    new_df = df[df['Length'] < 1500]
    filtered_frames.append(new_df)
    print(f'Shape of original {model}_df: {df.shape} and the Shape of new df: {new_df.shape}')
# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# everything collected so far on each iteration, and seeding it with an empty
# DataFrame relies on concat behaviour that recent pandas versions deprecate.
final_df = pd.concat(filtered_frames, ignore_index=True)
print(f'Shape of Final df: {final_df.shape}')

In [None]:
# Keep only the LLM rows whose prompt also occurs among the leaders' generated
# prompts, and order them like the leaders' data.
# `df_llm_texts` must already be in memory (it is built by an earlier cell).
df_real_life_leader = pd.read_csv('/projects/humansVsLLMs/data/data_leaders_with_demographics_semantics.csv')

# Prompts are matched on their leading characters only. Several names below
# still say "200", but the prefix the code compares is 150 characters long;
# the names are kept unchanged because they are notebook-level globals/columns.
PROMPT_PREFIX_LEN = 150

target_columns = [
    'Generated_Prompt_Uniqueness',
    'Generated_Prompt_Belongingness',
    'Generated_Prompt_Appreciation',
    'Generated_Prompt_OrgEfforts'
]

# One pass per prompt column collects (a) every distinct prefix and (b) the
# smallest leader row index at which each prefix occurs. With the default
# RangeIndex produced by read_csv this is the first occurrence in row order.
matching_200_chars = set()
prompt_order_mapping = {}
for col in target_columns:
    # NaN cells are skipped; everything else is truncated to the prefix length.
    prefixes = df_real_life_leader[col].dropna().map(lambda x: str(x)[:PROMPT_PREFIX_LEN])
    matching_200_chars.update(prefixes.unique())
    for idx, prefix in prefixes.items():
        if prefix and (prefix not in prompt_order_mapping or idx < prompt_order_mapping[prefix]):
            prompt_order_mapping[prefix] = idx

# Filter df_llm_texts to the rows whose prompt prefix is known from the leaders.
# (The helper column is added to df_llm_texts itself, as before.)
df_llm_texts['Prompt_200_chars'] = df_llm_texts['Prompt'].apply(lambda x: str(x)[:PROMPT_PREFIX_LEN])
df_llm_texts_filtered = df_llm_texts[df_llm_texts['Prompt_200_chars'].isin(matching_200_chars)].copy()

# Order by the leader row each prompt belongs to. Many LLM rows share the same
# order value, so a stable sort is requested: ties keep their df_llm_texts
# order instead of whatever the default, non-stable sort happens to produce.
df_llm_texts_filtered['order'] = df_llm_texts_filtered['Prompt_200_chars'].map(prompt_order_mapping)
df_llm_texts_filtered.sort_values('order', kind='stable', inplace=True)
df_llm_texts_filtered.drop(['order', 'Prompt_200_chars'], axis=1, inplace=True)  # Cleanup

# Reset index
df_llm_texts_filtered.reset_index(drop=True, inplace=True)
