In [None]:
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.tokenize import word_tokenize

In [None]:
# word_tokenize needs the Punkt sentence-tokenizer data. NLTK >= 3.8.2 loads it
# from the 'punkt_tab' package, while older releases use the pickled 'punkt'
# package, so fetch both to keep the notebook runnable on either version.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
# Fetch the Warriner et al. VAD lexicon and key it by word, so later cells can
# test `token in vad_lexicon.index` and read scores with `.loc[token, column]`.
vad_lexicon = pd.read_csv(
    'https://raw.githubusercontent.com/cwlab-vu/VAD-lexicon/master/Warriner_et_al.csv'
).set_index('Word')

In [None]:
# LLM-generated responses for a single model.
model = 'qwen'
path = '/projects/humansVsLLMs/results/3-shot-generated-responses'
df = pd.read_csv(f"{path}/{model}_generated_responses.csv")

# Keep only the rows that actually have a response, as plain strings.
has_response = df['Response'].notna()
texts = df.loc[has_response, 'Response'].astype(str)

In [None]:
# Human-written leader action plans. Running this cell replaces the `df` and
# `texts` loaded from the LLM responses, so relabel `model` as well: it is
# interpolated into the saved plot's file name, which would otherwise still
# carry the LLM's name even though the scores come from human text.
model = 'human'
path = '/projects/humansVsLLMs/data'
df = pd.read_csv(f"{path}/goals_leader_with_demographics.csv")
# Keep only non-missing action plans, coerced to str for tokenization.
texts = df['Leader_Action_Plans'].dropna().astype(str)

In [None]:
# Corpus-level VAD profile: average the lexicon's valence / arousal / dominance
# means over every token of `texts` that appears in `vad_lexicon`, then plot the
# three averages and save the figure as SVG.
file_path = '/projects/humansVsLLMs/plots'
vad_columns = ['V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum']

# Build a plain word -> [V, A, D] mapping once instead of doing an index
# membership test plus three `.loc` lookups per token. Duplicate lexicon words
# are dropped (first entry wins): with duplicates, `.loc[token, col]` returns a
# Series rather than a scalar and would corrupt the running sums.
unique_entries = vad_lexicon.loc[~vad_lexicon.index.duplicated(keep='first'), vad_columns]
vad_lookup = dict(zip(unique_entries.index, unique_entries.to_numpy(dtype=float)))

# Running totals in the order of `vad_columns`, plus the number of matched tokens.
vad_totals = np.zeros(len(vad_columns))
word_count = 0

# Go through each row; every occurrence of a lexicon word counts.
for text in texts:
    for token in word_tokenize(text.lower()):
        token_scores = vad_lookup.get(token)
        if token_scores is not None:
            vad_totals += token_scores
            word_count += 1

valence_sum, arousal_sum, dominance_sum = vad_totals

# Calculate averages (0 when no token matched the lexicon, avoiding a division by zero)
valence_avg = valence_sum / word_count if word_count > 0 else 0
arousal_avg = arousal_sum / word_count if word_count > 0 else 0
dominance_avg = dominance_sum / word_count if word_count > 0 else 0

# Create results DataFrame
results = pd.DataFrame({
    'Dimension': ['Valence', 'Arousal', 'Dominance'],
    'Average Score': [valence_avg, arousal_avg, dominance_avg]
})

# --------- Visualize & Save as SVG ---------
fig, ax = plt.subplots(figsize=(8, 6))
plot = sns.barplot(x='Dimension', y='Average Score', data=results, palette='Set2', ax=ax)
ax.set_title("Average Valence, Arousal, and Dominance Scores")
ax.set_ylim(0, 9)  # VAD scores typically range from 1 to 9

# Save as SVG; the file is named after whatever `model` currently holds.
fig.tight_layout()
fig.savefig(f"{file_path}/{model}_vad_scores.svg", format='svg')
plt.show()


print(results)


In [None]:


# Step 1: Load the NRC VAD lexicon (tab-separated; malformed lines are skipped),
# name its four columns and lower-case the terms.
vad_df = (
    pd.read_csv(
        "/projects/humansVsLLMs/data/VAD/NRC-VAD-Lexicon-v2.1/NRC-VAD-Lexicon-v2.1/NRC-VAD-Lexicon-v2.1.txt",
        sep="\t",
        on_bad_lines='skip',
    )
    .set_axis(['term', 'valence', 'arousal', 'dominance'], axis=1)
    .assign(term=lambda lexicon: lexicon['term'].str.lower())
)

# Step 2: Load the human leader data and keep only the rows whose
# GenderIdentity is neither 'Male' nor 'Female'.
rll_df = pd.read_csv("/projects/humansVsLLMs/data/data_leaders_with_demographics_semantics.csv")
is_male_or_female = rll_df['GenderIdentity'].isin(['Male', 'Female'])
rll_df = rll_df[~is_male_or_female]
print(rll_df.shape)
# Step 3: Define function to extract VAD scores
def extract_vad_scores(text, vad_lookup):
    """Return the mean (valence, arousal, dominance) of the lexicon terms found in `text`.

    The text is lower-cased and split on whitespace. Each word is looked up both
    as written and with leading/trailing punctuation removed, so tokens such as
    "team." or "(goal)" still match their lexicon entries. Every matched lexicon
    row contributes once, however many times its word occurs in the text.

    Parameters
    ----------
    text : str or missing value
        Text to score. Missing values (per `pd.isna`) yield NaNs; any other
        non-string scalar is converted with `str()` before matching.
    vad_lookup : pandas.DataFrame
        Lexicon with the columns 'term', 'valence', 'arousal' and 'dominance'.
        Words are lower-cased before comparison, so 'term' is expected to be
        lower-case as well.

    Returns
    -------
    tuple
        (valence, arousal, dominance) means over the matched rows, or
        (nan, nan, nan) when the text is missing or nothing matches.
    """
    if pd.isna(text):
        return np.nan, np.nan, np.nan

    raw_words = str(text).lower().split()
    # Keep the raw words (so anything that matched before still matches) and add
    # their punctuation-stripped forms; a word made only of punctuation strips
    # down to '' and is discarded.
    words = set(raw_words)
    words.update(word.strip(string.punctuation) for word in raw_words)
    words.discard('')

    matched = vad_lookup[vad_lookup['term'].isin(words)]

    if matched.empty:
        return np.nan, np.nan, np.nan
    return (
        matched['valence'].mean(),
        matched['arousal'].mean(),
        matched['dominance'].mean()
    )

# Step 4: Score both goal columns row-wise
score_columns = ['valence_score', 'arousal_score', 'dominance_score']
scores_uniq = rll_df['firstTaskGoal'].apply(lambda x: extract_vad_scores(x, vad_df))
scores_belong = rll_df['addFirstRelGoal'].apply(lambda x: extract_vad_scores(x, vad_df))
final_scores = scores_uniq.tolist() + scores_belong.to_list()

# Pooled table: one row per scored text, i.e. two rows per respondent.
pooled_scores = pd.DataFrame(final_scores, columns=score_columns)

# Per-respondent scores. The pooled table has twice as many rows as `rll_df`
# and a fresh 0..2n-1 index, so assigning it directly would be aligned by index
# label against the filtered frame and attach scores to the wrong rows (or
# leave NaN). Instead, carry each score under its respondent's own index label
# and average that respondent's two goal texts (NaNs are skipped).
per_goal_scores = pd.concat([
    pd.DataFrame(scores_uniq.tolist(), index=scores_uniq.index, columns=score_columns),
    pd.DataFrame(scores_belong.tolist(), index=scores_belong.index, columns=score_columns),
])
rll_df[score_columns] = per_goal_scores.groupby(level=0).mean()

# Step 5: Aggregate scores over every scored goal text (both columns pooled)
aggregated_scores = pooled_scores.mean()

# Display result
print("Aggregated VAD Scores:")
print(aggregated_scores)


In [None]:
# LLM-generated responses: which model to score and where its CSV lives.
model = 'gemini'
path = '/projects/humansVsLLMs/results/3-shot-generated-responses'


# Step 1: Load the NRC VAD lexicon (tab-separated; malformed lines are skipped),
# name its four columns and lower-case the terms.
vad_df = (
    pd.read_csv(
        "/projects/humansVsLLMs/data/VAD/NRC-VAD-Lexicon-v2.1/NRC-VAD-Lexicon-v2.1/NRC-VAD-Lexicon-v2.1.txt",
        sep="\t",
        on_bad_lines='skip',
    )
    .set_axis(['term', 'valence', 'arousal', 'dominance'], axis=1)
    .assign(term=lambda lexicon: lexicon['term'].str.lower())
)

# Step 2: Load the selected model's generated responses as llm_df
llm_df = pd.read_csv(f"{path}/{model}_generated_responses.csv")

# Step 3: Define function to extract VAD scores
def extract_vad_scores(text, vad_lookup):
    """Return the mean (valence, arousal, dominance) of the lexicon terms found in `text`.

    The text is lower-cased and split on whitespace. Each word is looked up both
    as written and with leading/trailing punctuation removed, so tokens such as
    "team." or "(goal)" still match their lexicon entries. Every matched lexicon
    row contributes once, however many times its word occurs in the text.

    Parameters
    ----------
    text : str or missing value
        Text to score. Missing values (per `pd.isna`) yield NaNs; any other
        non-string scalar is converted with `str()` before matching.
    vad_lookup : pandas.DataFrame
        Lexicon with the columns 'term', 'valence', 'arousal' and 'dominance'.
        Words are lower-cased before comparison, so 'term' is expected to be
        lower-case as well.

    Returns
    -------
    tuple
        (valence, arousal, dominance) means over the matched rows, or
        (nan, nan, nan) when the text is missing or nothing matches.
    """
    if pd.isna(text):
        return np.nan, np.nan, np.nan

    raw_words = str(text).lower().split()
    # Keep the raw words (so anything that matched before still matches) and add
    # their punctuation-stripped forms; a word made only of punctuation strips
    # down to '' and is discarded.
    words = set(raw_words)
    words.update(word.strip(string.punctuation) for word in raw_words)
    words.discard('')

    matched = vad_lookup[vad_lookup['term'].isin(words)]

    if matched.empty:
        return np.nan, np.nan, np.nan
    return (
        matched['valence'].mean(),
        matched['arousal'].mean(),
        matched['dominance'].mean()
    )

# Step 4: Score every response; each element of `scores` is a
# (valence, arousal, dominance) tuple.
scores = llm_df['Response'].map(lambda response: extract_vad_scores(response, vad_df))

# Expand the tuples into three columns and attach them to llm_df.
score_table = pd.DataFrame(scores.to_list())
llm_df[['valence_score', 'arousal_score', 'dominance_score']] = score_table

# Step 5: Average each dimension over all responses
aggregated_scores = llm_df.loc[:, ['valence_score', 'arousal_score', 'dominance_score']].mean()

# Display result
print(f"Aggregated VAD Scores for {model}:")
print(aggregated_scores)

