In [None]:
import pandas as pd
import numpy as np
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
from sentence_transformers import SentenceTransformer, util

In [12]:
# Load Sentence-BERT model
# `model` is read as a module-level global by calculate_bert_similarity(), so this
# cell has to run before that function is called and the name must keep pointing
# at the SentenceTransformer instance.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [13]:
# Scoring functions
def calculate_bleu(reference, hypothesis):
    """Sentence-level BLEU of `hypothesis` against a single `reference`.

    Both strings are tokenized by whitespace splitting; the reference tokens
    are wrapped in a one-element list before being handed to `sentence_bleu`.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    return sentence_bleu([reference_tokens], hypothesis_tokens)

def calculate_rouge(reference, hypothesis):
    """Return the ROUGE-L F1 score of `hypothesis` against `reference`.

    A fresh `Rouge` scorer is created per call; `get_scores` receives the
    hypothesis first and the reference second, and the first result is used.
    """
    return (
        Rouge()
        .get_scores(hypothesis, reference)[0]
        ['rouge-l']['f']  # F1 component of ROUGE-L
    )

def calculate_meteor(reference, hypothesis):
    """METEOR score of `hypothesis` against a single `reference`.

    Both strings are split on whitespace; the reference tokens are passed to
    `meteor_score` as the only entry of the reference list.
    """
    tokenized_references = [reference.split()]
    tokenized_hypothesis = hypothesis.split()
    return meteor_score(tokenized_references, tokenized_hypothesis)

def calculate_bert_similarity(reference, hypothesis):
    """Cosine similarity between the embeddings of `reference` and `hypothesis`.

    Each text is encoded separately with the module-level `model`
    (reference first), and the similarity is returned via `.item()`.
    """
    embeddings = [
        model.encode(text, convert_to_tensor=True)
        for text in (reference, hypothesis)
    ]
    similarity = util.pytorch_cos_sim(*embeddings)
    return similarity.item()

In [None]:
# Read DataFrame ('Reference' and 'Generated')
# The LLM identifier is kept in `model_name`, not `model`: calculate_bert_similarity()
# reads the global `model` and calls model.encode(...) on it, so binding a string to
# that name would make every similarity computation fail with an AttributeError.
model_name = 'gpt-4o-mini'
reference_df = pd.read_csv('/projects/humansVsLLMs/data/goals_leader_with_demographics.csv')  # Replace with actual file path
generated_df = pd.read_csv(f'/projects/humansVsLLMs/results/{model_name}_responses.csv') # Replace with actual file path

In [None]:
def _normalize_by_max(values):
    """Scale `values` so that the largest entry becomes 1.0.

    Returns a float array. The values are returned unscaled when the input is
    empty or when the maximum is not strictly positive (e.g. every BLEU score
    is 0), which avoids a division by zero that would fill the column with
    NaN/inf.
    """
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return arr
    peak = np.nanmax(arr)
    return arr / peak if peak > 0 else arr


# Pair every reference text with its generated response. zip() stops at the
# shorter column, so surplus rows in the longer frame are not scored.
text_pairs = list(zip(reference_df['Leader_Action_Plans'], generated_df['Response']))

# Initialize lists to store scores
bleu_scores, rouge_scores, meteor_scores, bert_scores = [], [], [], []
bert_scores_precision, bert_scores_recall, bert_scores_f1 = [], [], []

# Iterate through each pair and compute the per-pair metrics
for ref, gen in text_pairs:
    bleu_scores.append(calculate_bleu(ref, gen))
    rouge_scores.append(calculate_rouge(ref, gen))
    meteor_scores.append(calculate_meteor(ref, gen))
    bert_scores.append(calculate_bert_similarity(ref, gen))

# BERTScore: score() takes (candidates, references) as parallel lists of strings
# and returns one precision / recall / F1 value per pair. Passing bare strings
# makes it treat each string as a sequence, and passing the reference first
# swaps precision with recall. A single batched call also avoids setting the
# scorer up again for every row.
if text_pairs:
    P, R, F1 = score(
        [gen for _, gen in text_pairs],
        [ref for ref, _ in text_pairs],
        lang="en",
        model_type="bert-base-uncased",
    )
    # Store plain Python floats rather than tensors
    bert_scores_precision = P.tolist()
    bert_scores_recall = R.tolist()
    bert_scores_f1 = F1.tolist()

# Normalize scores
generated_df['BLEU'] = _normalize_by_max(bleu_scores)
generated_df['ROUGE'] = _normalize_by_max(rouge_scores)
generated_df['METEOR'] = _normalize_by_max(meteor_scores)
generated_df['BERT_Score_Precision'] = _normalize_by_max(bert_scores_precision)
generated_df['BERT_Score_Recall'] = _normalize_by_max(bert_scores_recall)
generated_df['BERT_Score_F1'] = _normalize_by_max(bert_scores_f1)
# Sentence-BERT cosine similarity was computed but never written out before;
# it is stored as-is (not max-normalized).
generated_df['SBERT_Similarity'] = bert_scores

# Save results
generated_df.to_csv('output_scores.csv', index=False)
print("Scores saved to output_scores.csv")


# TTR, MATTR, Readability

In [None]:
import pandas as pd
import nltk
from textstat import flesch_reading_ease
from lexicalrichness import LexicalRichness

In [None]:
# Download the 'punkt' and 'wordnet' NLTK resources, in that order
for resource_name in ('punkt', 'wordnet'):
    nltk.download(resource_name)

In [None]:

def load_candidate(file_path):
    """Read the CSV at `file_path` and return its contents as a DataFrame."""
    return pd.read_csv(file_path)

In [None]:

def calculate_readability_lexical(candidate_df, text_col='Response', window_size=3):
    """Compute readability and lexical-diversity scores for each text.

    Args:
        candidate_df: DataFrame holding the texts in `text_col`. A plain
            iterable of texts is accepted as well, in which case `text_col`
            is ignored.
        text_col: Column to read when `candidate_df` is a DataFrame.
        window_size: Window length passed to `LexicalRichness.mattr`
            (defaults to 3, the value that used to be hard-coded).

    Returns:
        A tuple `(readability, ttr_scores, mattr_scores)` of lists, one entry
        per input text and in input order. Entries that cannot be scored are
        NaN: all three for a missing/non-string/blank text, TTR for a text
        with no words, and MATTR for a text with fewer words than
        `window_size`.
    """
    if isinstance(candidate_df, pd.DataFrame):
        texts = candidate_df[text_col]
    else:
        texts = candidate_df

    missing = float('nan')
    readability = []
    ttr_scores = []
    mattr_scores = []

    for text in texts:
        # Missing cells (None / NaN) and blank strings cannot be scored; keep a
        # NaN placeholder so the outputs stay aligned with the input rows.
        if not isinstance(text, str) or not text.strip():
            readability.append(missing)
            ttr_scores.append(missing)
            mattr_scores.append(missing)
            continue

        readability.append(flesch_reading_ease(text))
        lex = LexicalRichness(text)
        word_count = lex.words
        # ttr divides by the word count, and mattr rejects a window longer
        # than the text, so guard both instead of aborting the whole run.
        ttr_scores.append(lex.ttr if word_count else missing)
        if word_count >= window_size:
            mattr_scores.append(lex.mattr(window_size=window_size))
        else:
            mattr_scores.append(missing)

    return readability, ttr_scores, mattr_scores

In [None]:

def compile_scores(readability, ttr, mattr, metric_scores=None):
    """Assemble per-text scores into a single DataFrame.

    Args:
        readability: Readability score per text.
        ttr: Type-token ratio per text.
        mattr: MATTR value per text.
        metric_scores: Optional extra metrics (anything `pd.DataFrame`
            accepts, e.g. a dict of columns or a DataFrame) with one row per
            text. When given, its columns come first.

    Returns:
        DataFrame with any `metric_scores` columns followed by
        'Readability', 'TTR' and 'MATTR'.
    """
    # `is not None` rather than truthiness: the truth value of a DataFrame is
    # ambiguous and raises. The copy keeps a caller-supplied frame untouched.
    if metric_scores is not None:
        df_scores = pd.DataFrame(metric_scores).copy()
    else:
        df_scores = pd.DataFrame()

    # These columns are added in both cases; previously they were only set
    # when no metric_scores were passed, so they were lost otherwise.
    df_scores['Readability'] = readability
    df_scores['TTR'] = ttr
    df_scores['MATTR'] = mattr
    return df_scores

In [None]:

def main(model_name):
    """Score one model's zero-shot responses and write the scores to CSV.

    Loads `<model_name>_generated_responses.csv`, computes readability, TTR
    and MATTR for it, saves the compiled scores to
    `<model_name>_evaluation_scores.csv`, and prints the shape of the loaded
    frame and of the score frame.
    """
    responses_csv = f'/projects/humansVsLLMs/results/0-shot-generated-responses/{model_name}_generated_responses.csv'
    scores_csv = f'/projects/humansVsLLMs/results/semantic_analysis/0-shot/{model_name}_evaluation_scores.csv'

    candidate_df = load_candidate(responses_csv)
    print(candidate_df.shape)

    # The three score lists are forwarded positionally, in the order returned
    df_scores = compile_scores(*calculate_readability_lexical(candidate_df))

    # Save or display results
    df_scores.to_csv(scores_csv, index=False)
    print(df_scores.shape)

In [None]:
# Entry point: run the readability / lexical-richness evaluation for the
# 'gemini' responses. Change `model_name` to evaluate another model's file.
if __name__ == "__main__":
    main(model_name='gemini')

# Gender Comparison

In [None]:
def calculate_readability_lexical(response_texts, text_col='Response', window_size=2):
    """Compute readability and lexical-diversity scores for each text.

    This definition replaces the earlier function of the same name, which
    takes a DataFrame plus `text_col`. Both call styles are accepted here so
    that code written against either signature keeps working.

    Args:
        response_texts: Iterable of texts, or a DataFrame whose `text_col`
            column holds the texts.
        text_col: Column to read when `response_texts` is a DataFrame;
            ignored otherwise.
        window_size: Window length passed to `LexicalRichness.mattr`
            (defaults to 2, the value that used to be hard-coded).

    Returns:
        A tuple `(readability, ttr_scores, mattr_scores)` of lists, one entry
        per input text and in input order. Entries that cannot be scored are
        NaN: all three for a missing/non-string/blank text, TTR for a text
        with no words, and MATTR for a text with fewer words than
        `window_size`.
    """
    # Iterating a DataFrame yields its column names, not its rows, so pull
    # the text column out explicitly.
    if isinstance(response_texts, pd.DataFrame):
        texts = response_texts[text_col]
    else:
        texts = response_texts

    missing = float('nan')
    readability = []
    ttr_scores = []
    mattr_scores = []

    for text in texts:
        # Keep a NaN placeholder for unscorable entries so the three lists
        # stay aligned with the input.
        if not isinstance(text, str) or not text.strip():
            readability.append(missing)
            ttr_scores.append(missing)
            mattr_scores.append(missing)
            continue

        readability.append(flesch_reading_ease(text))
        lex = LexicalRichness(text)
        word_count = lex.words
        # ttr divides by the word count, and mattr rejects a window longer
        # than the text, so guard both instead of aborting the whole run.
        ttr_scores.append(lex.ttr if word_count else missing)
        if word_count >= window_size:
            mattr_scores.append(lex.mattr(window_size=window_size))
        else:
            mattr_scores.append(missing)
    return readability, ttr_scores, mattr_scores

def compile_scores(readability, ttr, mattr, metric_scores=None):
    """Assemble per-text scores into a single DataFrame.

    This definition replaces the earlier `compile_scores`; the optional
    `metric_scores` parameter is kept so both definitions share a signature.

    Args:
        readability: Readability score per text.
        ttr: Type-token ratio per text.
        mattr: MATTR value per text.
        metric_scores: Optional extra metrics (anything `pd.DataFrame`
            accepts) with one row per text. When given, its columns come
            first.

    Returns:
        DataFrame with any `metric_scores` columns followed by
        'Readability', 'TTR' and 'MATTR'.
    """
    if metric_scores is not None:
        # Copy so that a caller-supplied frame is not modified
        df_scores = pd.DataFrame(metric_scores).copy()
    else:
        df_scores = pd.DataFrame()
    df_scores['Readability'] = readability
    df_scores['TTR'] = ttr
    df_scores['MATTR'] = mattr
    return df_scores

In [None]:
# For gender wise comparison
def _non_null_texts(frame, column):
    """Return the non-missing entries of `column` as a list of strings."""
    return frame[column].dropna().astype(str).to_list()


gender = "NB"
path = '/projects/humansVsLLMs/data'
df = pd.read_csv(f"{path}/data_leaders_with_demographics_semantics.csv")  # Replace with your file

# Keep only the rows whose GenderIdentity is neither "Female" nor "Male".
# NOTE(review): rows with a missing GenderIdentity also pass this filter — confirm that is intended.
binary_identity_mask = df['GenderIdentity'].isin(["Female", "Male"])
df = df[~binary_identity_mask]

response_texts_uniqueness = _non_null_texts(df, 'firstTaskGoal')
print(len(response_texts_uniqueness))
response_texts_belongingness = _non_null_texts(df, 'addFirstRelGoal')
print(len(response_texts_belongingness))
# The 'addFirstRelGoal' texts come first in the combined list
response_texts = response_texts_belongingness + response_texts_uniqueness
print(len(response_texts))

# Score every text, then gather the three metrics into one frame
readability, ttr, mattr = calculate_readability_lexical(response_texts)
df_scores = compile_scores(readability, ttr, mattr)

# Save or display results
scores_csv = f'/projects/humansVsLLMs/results/semantic_analysis/{gender}_evaluation_scores.csv'
df_scores.to_csv(scores_csv, index=False)
print(df_scores.shape)