In [2]:
from itertools import chain
import nltk
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from collections import defaultdict

In [None]:
# For gender wise comparison: load the human responses for one gender subgroup.
path = '/projects/humansVsLLMs/data'
df = pd.read_csv(f"{path}/data_leaders_with_demographics_semantics.csv")  # Replace with your file

# Keep respondents whose GenderIdentity is neither 'Male' nor 'Female'.
# isin() is False for missing values, so rows without a GenderIdentity are kept too.
is_male_or_female = df['GenderIdentity'].isin(['Male', 'Female'])
df = df.loc[~is_male_or_female]

# Drop empty answers and force everything to str before tokenization.
response_texts_uniqueness = df['firstTaskGoal'].dropna().astype(str).tolist()
response_texts_belongingness = df['addFirstRelGoal'].dropna().astype(str).tolist()
print(len(response_texts_uniqueness))
print(len(response_texts_belongingness))

# Pool both goal types into the single corpus used by the analysis cell below.
response_texts = response_texts_belongingness + response_texts_uniqueness
print(len(response_texts))

In [None]:
# Download required NLTK data (run once).
# 'punkt' and 'averaged_perceptron_tagger' are the resource names used by older
# NLTK releases; NLTK >= 3.9 loads 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead, so both generations are requested
# to keep word_tokenize / pos_tag working regardless of the installed version.
# NOTE(review): on a release whose index lacks one of these names,
# nltk.download is expected to report the miss and return False rather than
# raise -- confirm on your environment.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
):
    nltk.download(resource)

In [4]:
def extract_top_nouns_verbs(texts, top_n=20):
    """Return the most frequent nouns and verbs found in a collection of texts.

    Each text is lowercased, tokenized with ``word_tokenize`` and tagged with
    ``pos_tag``. Tokens that are purely alphabetic and not English stopwords
    are counted as nouns when their tag starts with ``NN`` and as verbs when
    it starts with ``VB``.

    Args:
        texts: Iterable of response strings. Entries that are not ``str``
            (for example ``None`` or a float ``NaN``) are skipped instead of
            raising ``AttributeError`` on ``.lower()``.
        top_n: Maximum number of ``(word, count)`` pairs returned per category.

    Returns:
        A ``(top_nouns, top_verbs)`` tuple. Each element is a list of
        ``(word, count)`` tuples sorted by descending count; words with equal
        counts keep the order in which they were first seen.
    """
    nouns = defaultdict(int)
    verbs = defaultdict(int)
    stop_words = set(stopwords.words('english'))

    for text in texts:
        # Guard against missing / non-text entries so one bad row cannot
        # abort the whole analysis.
        if not isinstance(text, str):
            continue

        # Lowercase for consistent counting, then tokenize and POS-tag.
        tokens = word_tokenize(text.lower())
        tagged = pos_tag(tokens)

        for word, tag in tagged:
            # Ignore punctuation, numbers, mixed tokens and stopwords.
            if not word.isalpha() or word in stop_words:
                continue
            if tag.startswith('NN'):
                # Noun tags: NN, NNS, NNP, NNPS
                nouns[word] += 1
            elif tag.startswith('VB'):
                # Verb tags: VB, VBD, VBG, VBN, VBP, VBZ
                verbs[word] += 1

    def by_count(item):
        return item[1]

    # sorted() is stable, so ties preserve first-seen order.
    top_nouns = sorted(nouns.items(), key=by_count, reverse=True)[:top_n]
    top_verbs = sorted(verbs.items(), key=by_count, reverse=True)[:top_n]

    return top_nouns, top_verbs

In [None]:
# RLL usage: rank nouns and verbs in the responses loaded in the previous cell.
top_nouns, top_verbs = extract_top_nouns_verbs(response_texts, top_n=20)
for heading, ranking in (("Top 20 Nouns:", top_nouns), ("Top 20 Verbs:", top_verbs)):
    print(heading, ranking)

In [None]:
# For LLM generated responses: run the same noun/verb ranking on each model's output file.
path = '/projects/humansVsLLMs/results/3-shot-generated-responses'
models = ['cohere', 'deepseek', 'gemini', 'gpt-4o-mini', 'llama', 'mistral', 'qwen']
for model in models:
    df = pd.read_csv(f'{path}/{model}_generated_responses.csv')
    # Drop empty responses and coerce to str, matching how the human responses
    # are prepared; an empty CSV cell is read as NaN, which would otherwise
    # break tokenization and inflate the printed count.
    response_texts = df['Response'].dropna().astype(str).to_list()
    print(len(response_texts))
    top_nouns, top_verbs = extract_top_nouns_verbs(response_texts, top_n=20)
    print(f"{model}: {top_nouns}: {top_verbs}")
    print("Top 20 Nouns:", top_nouns)
    print("Top 20 Verbs:", top_verbs)
    

In [None]:
# For age wise comparison: load the human responses for one age subgroup.
path = '/projects/humansVsLLMs/data'
df = pd.read_csv(f"{path}/data_leaders_with_demographics_semantics.csv")  # Replace with your file

# Ages belonging to the subgroup under study.
# NOTE(review): values are matched as strings, which assumes the Age column is
# read as text rather than as a numeric dtype -- confirm against the CSV.
selected_ages = ['61', '62', '64']
df = df.loc[df['Age'].isin(selected_ages)]

# Drop empty answers and force everything to str before tokenization.
response_texts_uniqueness = df['firstTaskGoal'].dropna().astype(str).tolist()
response_texts_belongingness = df['addFirstRelGoal'].dropna().astype(str).tolist()
print(len(response_texts_uniqueness))
print(len(response_texts_belongingness))

# Pool both goal types into the single corpus used by the analysis cell below.
response_texts = response_texts_belongingness + response_texts_uniqueness
print(len(response_texts))

In [None]:
# RLL usage: rank nouns and verbs in the responses loaded in the previous cell.
top_nouns, top_verbs = extract_top_nouns_verbs(response_texts, top_n=20)
for heading, ranking in (("Top 20 Nouns:", top_nouns), ("Top 20 Verbs:", top_verbs)):
    print(heading, ranking)