# BLEU Scores

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def calculate_model_bleu_scores(df):
    """
    Calculate sentence-level BLEU statistics for each model in ``df``.

    For every row the candidate is ``Selected_Text`` and the single reference
    is chosen by ``Dimension``:

    * ``'Uniqueness'`` / ``'Appreciation'``: ``firstTaskGoal``, falling back
      to ``addTaskGoals`` when it is missing.
    * any other value: ``addFirstRelGoal``, falling back to ``addRelGoals``.

    Rows whose reference or candidate is missing are skipped. Both texts are
    lower-cased, tokenized with ``word_tokenize`` and scored with
    ``sentence_bleu`` using ``SmoothingFunction().method1``.

    Args:
        df: DataFrame containing the evaluation data

    Returns:
        Dictionary mapping each model name to a dict with the keys
        ``average_bleu``, ``min_bleu``, ``max_bleu``, ``median_bleu``,
        ``num_comparisons`` and ``all_scores``. The four statistics are
        ``None`` when the model has no scorable rows.

    Raises:
        ValueError: If ``df`` lacks any column this function reads.
    """
    from statistics import median

    # Check every column that is read below, including the two used by the
    # else-branch, so a missing column fails here with a clear message
    # instead of as a KeyError in the middle of the row loop.
    required_columns = [
        'model_name', 'Dimension', 'firstTaskGoal', 'addTaskGoals',
        'addFirstRelGoal', 'addRelGoals', 'Selected_Text',
    ]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"DataFrame is missing required column: {col}")

    # Initialize results dictionary and smoothing function
    results = {}
    smoother = SmoothingFunction().method1

    for model in df['model_name'].unique():
        model_df = df[df['model_name'] == model]
        bleu_scores = []

        for _, row in model_df.iterrows():
            # Determine reference text based on Dimension
            if row['Dimension'] in ['Uniqueness', 'Appreciation']:
                reference_text = row['firstTaskGoal'] if pd.notna(row['firstTaskGoal']) else row['addTaskGoals']
            else:
                # All remaining dimensions use addFirstRelGoal / addRelGoals.
                reference_text = row['addFirstRelGoal'] if pd.notna(row['addFirstRelGoal']) else row['addRelGoals']

            # Get candidate text
            candidate_text = row['Selected_Text']

            # Skip if either text is missing
            if pd.isna(reference_text) or pd.isna(candidate_text):
                continue

            try:
                # sentence_bleu expects a list of tokenized references.
                reference_tokens = [word_tokenize(str(reference_text).lower())]
                candidate_tokens = word_tokenize(str(candidate_text).lower())
                score = sentence_bleu(
                    reference_tokens,
                    candidate_tokens,
                    smoothing_function=smoother
                )
            except Exception as e:
                # Best effort: keep going, but report the failure so that e.g.
                # a missing tokenizer resource does not silently yield zero
                # comparisons for every model.
                print(f"Error calculating BLEU: {e}")
                continue
            bleu_scores.append(score)

        # Store results for this model
        if bleu_scores:
            results[model] = {
                'average_bleu': sum(bleu_scores) / len(bleu_scores),
                'min_bleu': min(bleu_scores),
                'max_bleu': max(bleu_scores),
                # True median: averages the two middle scores for even counts.
                'median_bleu': median(bleu_scores),
                'num_comparisons': len(bleu_scores),
                'all_scores': bleu_scores
            }
        else:
            results[model] = {
                'average_bleu': None,
                'min_bleu': None,
                'max_bleu': None,
                'median_bleu': None,
                'num_comparisons': 0,
                'all_scores': []
            }

    return results

def read_and_process_file(file_path, sheet_name='Final_Selection'):
    """
    Read the evaluation data and verify that the expected columns exist.

    Args:
        file_path: Path to the input file (CSV or Excel), given as a string
            or a path-like object. The extension is matched
            case-insensitively.
        sheet_name: Worksheet to load when the input is an Excel file.

    Returns:
        DataFrame with the file contents

    Raises:
        ValueError: If the extension is not .csv/.xls/.xlsx, or if a required
            column is absent from the loaded data.
    """
    # str() lets pathlib.Path objects through; lower() accepts ".CSV" etc.
    lowered_path = str(file_path).lower()

    # Read file based on extension
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file_path, sheet_name=sheet_name)
    else:
        raise ValueError("Unsupported file format. Please provide CSV or Excel file.")

    # Ensure required columns exist
    required_columns = ['model_name', 'Dimension', 'firstTaskGoal', 'addTaskGoals', 'addFirstRelGoal', 'addRelGoals', 'Selected_Text']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Input file is missing required column: {col}")

    return df



def main():
    """Load the evaluation workbook, score it with BLEU and print a per-model report.

    Any exception raised while loading or scoring is caught and printed as
    ``Error: ...``; nothing is returned.
    """
    # Example usage
    file_path = '/projects/humansVsLLMs/data/EMNLP-HUmanEvaluation.xlsx'  # Change to your file path
    try:
        evaluation_df = read_and_process_file(file_path)
        per_model = calculate_model_bleu_scores(evaluation_df)

        # Report header
        print("BLEU Score Aggregation by Model:")
        print("=" * 50)

        for model, scores in per_model.items():
            print(f"\nModel: {model}")
            print("-" * 30)

            comparisons = scores['num_comparisons']
            if comparisons <= 0:
                print("No valid comparisons found for this model.")
                continue

            print(f"Average BLEU: {scores['average_bleu']:.4f}")
            print(f"Minimum BLEU: {scores['min_bleu']:.4f}")
            print(f"Maximum BLEU: {scores['max_bleu']:.4f}")
            print(f"Median BLEU: {scores['median_bleu']:.4f}")
            print(f"Number of comparisons: {scores['num_comparisons']}")

        # To persist the results, build a frame with
        # pd.DataFrame.from_dict(per_model, orient='index') and write it
        # out, e.g. to 'bleu_scores_results.csv'.

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()

# Sentence Similarity

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')

# Initialize sentence transformer model (using a lightweight model).
# Created once at module level; calculate_sentence_similarity() reads this
# global for every pair it encodes.
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_sentence_similarity(reference_text, candidate_text):
    """Return the cosine similarity of the embeddings of two texts.

    Both inputs are converted with ``str()`` and encoded together by the
    module-level ``model``. ``np.nan`` is returned when either input is
    missing, or when encoding/scoring raises (the error is printed).
    """
    # Nothing to compare when either side is missing.
    if pd.isna(reference_text):
        return np.nan
    if pd.isna(candidate_text):
        return np.nan

    try:
        text_pair = [str(reference_text), str(candidate_text)]
        vectors = model.encode(text_pair)
        # cosine_similarity works on 2-D inputs, so wrap each vector in a list
        # and pull the single entry out of the resulting 1x1 matrix.
        score_matrix = cosine_similarity([vectors[0]], [vectors[1]])
        return score_matrix[0][0]
    except Exception as e:
        print(f"Error calculating similarity: {e}")
        return np.nan

def calculate_model_similarities(df):
    """Calculate sentence similarities for each model based on specified conditions.

    For every row the candidate is ``Selected_Text``. The reference is
    ``firstTaskGoal`` (falling back to ``addTaskGoals``) when ``Dimension`` is
    ``'Uniqueness'`` or ``'Appreciation'``, and ``addFirstRelGoal`` (falling
    back to ``addRelGoals``) for every other dimension. Pairs are scored with
    ``calculate_sentence_similarity``; NaN results are dropped.

    Args:
        df: DataFrame containing the evaluation data

    Returns:
        Dictionary mapping each model name to its list of similarity scores,
        or to ``None`` when the model produced no valid score.

    Raises:
        ValueError: If ``df`` lacks any column this function reads.
    """
    # Include the two columns read by the else-branch so that a missing
    # column is reported up front rather than as a KeyError inside the loop.
    required_columns = [
        'model_name', 'Dimension', 'firstTaskGoal', 'addTaskGoals',
        'addFirstRelGoal', 'addRelGoals', 'Selected_Text',
    ]
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"DataFrame is missing required column: {col}")

    results = {}

    for model in df['model_name'].unique():
        model_df = df[df['model_name'] == model]
        similarities = []

        for _, row in model_df.iterrows():
            if row['Dimension'] in ['Uniqueness', 'Appreciation']:
                reference_text = row['firstTaskGoal'] if pd.notna(row['firstTaskGoal']) else row['addTaskGoals']
            else:
                # All remaining dimensions use addFirstRelGoal / addRelGoals.
                reference_text = row['addFirstRelGoal'] if pd.notna(row['addFirstRelGoal']) else row['addRelGoals']

            candidate_text = row['Selected_Text']
            similarity = calculate_sentence_similarity(reference_text, candidate_text)

            # The scorer signals "missing input" or "failed" with NaN.
            if not np.isnan(similarity):
                similarities.append(similarity)

        results[model] = similarities if similarities else None

    return results

def visualize_similarities(results, output_file='model_similarities.svg'):
    """Visualize sentence similarities across models and save as SVG.

    Draws one box per model with the individual scores overlaid as jittered
    points and a red horizontal line at each model's mean.

    Args:
        results: Dictionary mapping model name to a list of similarity
            scores; entries that are ``None`` or empty are ignored.
        output_file: Destination path of the SVG figure.

    Returns:
        None. When there is no score at all, a message is printed and no
        figure is created.
    """
    # Flatten to one record per score for seaborn's long-form API.
    viz_data = []
    for model_name, scores in results.items():
        if scores:
            viz_data.extend({'Model': model_name, 'Similarity': score} for score in scores)

    if not viz_data:
        print("No valid similarity scores to visualize")
        return

    df_viz = pd.DataFrame(viz_data)

    # Fix the category order explicitly. groupby() sorts its keys while the
    # plots would otherwise follow order of appearance, which could put a
    # model's mean marker on another model's box.
    model_order = list(df_viz['Model'].unique())

    # Create figure
    plt.figure(figsize=(12, 6))
    sns.set_style("whitegrid")

    # Create boxplot with jittered points
    ax = sns.boxplot(x='Model', y='Similarity', data=df_viz, order=model_order,
                     width=0.6, showfliers=False)
    sns.stripplot(x='Model', y='Similarity', data=df_viz, order=model_order,
                  color='black', alpha=0.3, jitter=True, size=4, ax=ax)

    # Add average markers, positioned by the same order as the boxes
    averages = df_viz.groupby('Model')['Similarity'].mean()
    for position, model_name in enumerate(model_order):
        mean_value = averages[model_name]
        ax.plot([position - 0.2, position + 0.2], [mean_value, mean_value],
                color='red', linestyle='-', linewidth=2)

    # Customize plot
    plt.title('Sentence Similarity Distribution by Model', fontsize=14, pad=20)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Cosine Similarity', fontsize=12)
    # Cosine similarity can be negative; keep the usual 0..1 window but widen
    # it downwards when needed so that no point falls outside the axes.
    lower_bound = min(0.0, float(df_viz['Similarity'].min()))
    plt.ylim(lower_bound, 1)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()

    # Save as SVG
    plt.savefig(output_file, format='svg', bbox_inches='tight')
    print(f"Visualization saved as {output_file}")
    plt.close()

def read_and_process_file(file_path, sheet_name='Final_Selection'):
    """Read the input file and verify that the expected columns exist.

    Args:
        file_path: Path to the input file (CSV or Excel), given as a string
            or a path-like object. The extension is matched
            case-insensitively.
        sheet_name: Worksheet to load when the input is an Excel file.

    Returns:
        DataFrame with the file contents

    Raises:
        ValueError: If the extension is not .csv/.xls/.xlsx, or if a required
            column is absent from the loaded data.
    """
    # str() lets pathlib.Path objects through; lower() accepts ".CSV" etc.
    lowered_path = str(file_path).lower()

    if lowered_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file_path, sheet_name=sheet_name)
    else:
        raise ValueError("Unsupported file format. Please provide CSV or Excel file.")

    required_columns = ['model_name', 'Dimension', 'firstTaskGoal', 'addTaskGoals', 'addFirstRelGoal', 'addRelGoals', 'Selected_Text']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Input file is missing required column: {col}")

    return df

def main():
    """Score the evaluation workbook with sentence similarity, print a summary and plot it.

    Any exception raised along the way is caught and printed as
    ``Error: ...``; nothing is returned.
    """
    file_path = '/projects/humansVsLLMs/data/EMNLP-HUmanEvaluation.xlsx'  # Change to your file path
    output_plot = '/projects/humansVsLLMs/plots/model_similarities.svg'

    try:
        evaluation_df = read_and_process_file(file_path)
        per_model = calculate_model_similarities(evaluation_df)

        # Print summary statistics
        print("Sentence Similarity Summary by Model:")
        print("=" * 50)
        for model, scores in per_model.items():
            # Models without any valid score only get a one-line notice.
            if not scores:
                print(f"\nModel: {model} - No valid scores")
                continue

            print(f"\nModel: {model}")
            print(f"  Count: {len(scores)}")
            print(f"  Average: {np.mean(scores):.4f}")
            print(f"  Min: {np.min(scores):.4f}")
            print(f"  Max: {np.max(scores):.4f}")
            print(f"  Std Dev: {np.std(scores):.4f}")

        # Create and save visualization
        visualize_similarities(per_model, output_plot)

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()

# ROUGE Scores

In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import statistics as s
import nltk
nltk.download('punkt')



# Score every usable (reference, candidate) row of one model's DataFrame
def compute_rouge(model_df):
    """
    Compute ROUGE-1, ROUGE-2 and ROUGE-L for each scorable row of ``model_df``.

    The candidate is ``Selected_Text``. The reference is ``firstTaskGoal``
    (falling back to ``addTaskGoals``) when ``Dimension`` is ``'Uniqueness'``
    or ``'Appreciation'``, and ``addFirstRelGoal`` (falling back to
    ``addRelGoals``) for every other dimension. Rows with a missing reference
    or candidate are skipped.

    Args:
        model_df: DataFrame holding the rows of a single model

    Returns:
        List with one ``scorer.score`` result per scored row; each result is
        indexable by ``'rouge1'``, ``'rouge2'`` and ``'rougeL'``.
    """
    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    for _, row in model_df.iterrows():
        # Determine reference text based on Dimension
        if row['Dimension'] in ['Uniqueness', 'Appreciation']:
            reference_text = row['firstTaskGoal'] if pd.notna(row['firstTaskGoal']) else row['addTaskGoals']
        else:
            # All remaining dimensions use addFirstRelGoal / addRelGoals.
            reference_text = row['addFirstRelGoal'] if pd.notna(row['addFirstRelGoal']) else row['addRelGoals']
        # Get candidate text
        candidate_text = row['Selected_Text']

        # Skip if either text is missing
        if pd.isna(reference_text) or pd.isna(candidate_text):
            continue
        try:
            # str() mirrors the BLEU and similarity code paths, so cells that
            # pandas parsed as numbers are scored rather than dropped.
            scores = scorer.score(str(reference_text), str(candidate_text))
        except Exception as e:
            # Best effort: report the failure and move on to the next row.
            print(f"Error calculating ROUGE: {e}")
            continue
        rouge_scores.append(scores)
    return rouge_scores
    
def read_and_process_file(file_path, sheet_name='Final_Selection'):
    """
    Read the evaluation data and verify that the expected columns exist.

    Args:
        file_path: Path to the input file (CSV or Excel), given as a string
            or a path-like object. The extension is matched
            case-insensitively.
        sheet_name: Worksheet to load when the input is an Excel file.

    Returns:
        DataFrame with the file contents

    Raises:
        ValueError: If the extension is not .csv/.xls/.xlsx, or if a required
            column is absent from the loaded data.
    """
    # str() lets pathlib.Path objects through; lower() accepts ".CSV" etc.
    lowered_path = str(file_path).lower()

    # Read file based on extension
    if lowered_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif lowered_path.endswith(('.xls', '.xlsx')):
        df = pd.read_excel(file_path, sheet_name=sheet_name)
    else:
        raise ValueError("Unsupported file format. Please provide CSV or Excel file.")

    # Ensure required columns exist
    required_columns = ['model_name', 'Dimension', 'firstTaskGoal', 'addTaskGoals', 'addFirstRelGoal', 'addRelGoals', 'Selected_Text']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Input file is missing required column: {col}")

    return df


def main():
    """Load the evaluation workbook and print mean ROUGE F-measures per model.

    For each model the mean F-measure of ROUGE-1, ROUGE-2 and ROUGE-L is
    computed over that model's rows only. Any exception raised along the way
    is caught and printed as ``Error: ...``; nothing is returned.
    """
    # Example usage
    file_path = '/projects/humansVsLLMs/data/EMNLP-HUmanEvaluation.xlsx'  # Change to your file path
    try:
        # Read and process the file
        df = read_and_process_file(file_path)
        # Get unique models
        models = df['model_name'].unique()
        for model in models:
            model_df = df[df['model_name'] == model]
            rouge_scores = compute_rouge(model_df)

            # Build the lists afresh for every model; sharing them across
            # iterations would report a running mean over all models seen so
            # far rather than this model's own scores.
            R1 = [scores['rouge1'].fmeasure for scores in rouge_scores]
            R2 = [scores['rouge2'].fmeasure for scores in rouge_scores]
            R3 = [scores['rougeL'].fmeasure for scores in rouge_scores]

            print(f"Aggregated ROUGE Scores for model {model}:")
            if not rouge_scores:
                # statistics.mean() raises on empty input, which would abort
                # the report for all remaining models.
                print("No valid comparisons found for this model.")
                continue
            print(f"R1: {s.mean(R1)} | R2: {s.mean(R2)} | RL: {s.mean(R3)}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()

# Lexicon Diversity

In [None]:
# Source: https://github.com/neural-dialogue-metrics/Distinct-N/tree/main/distinct_n

In [1]:
from itertools import chain
import pandas as pd

In [2]:

# NOTE(review): `__all__` is assigned again further down in this cell with a
# list that does not contain "ngrams", so this value does not survive.
__all__ = ["ngrams"]


def pad_sequence(sequence, n, pad_left=False, pad_right=False,
                 left_pad_symbol=None, right_pad_symbol=None):
    """
    Wrap ``sequence`` in an iterator, optionally padded for ngram extraction.

    Each enabled side receives ``n - 1`` copies of its pad symbol.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether to prepend padding
    :type pad_left: bool
    :param pad_right: whether to append padding
    :type pad_right: bool
    :param left_pad_symbol: the symbol used for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol used for right padding (default is None)
    :type right_pad_symbol: any
    :rtype: iter
    """
    pad_width = n - 1
    padded = iter(sequence)
    if pad_left:
        padded = chain([left_pad_symbol] * pad_width, padded)
    if pad_right:
        padded = chain(padded, [right_pad_symbol] * pad_width)
    return padded


def ngrams(sequence, n, pad_left=False, pad_right=False,
           left_pad_symbol=None, right_pad_symbol=None):
    """
    Return the ngrams generated from a sequence of items, as an iterator.
    For example:

        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Wrap with list for a list version of this function.  Set pad_left
    or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    A sequence that is too short to form a single ngram yields nothing:

        >>> list(ngrams([1], 3))
        []

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :rtype: sequence or iter
    """
    sequence = pad_sequence(sequence, n, pad_left, pad_right,
                            left_pad_symbol, right_pad_symbol)

    # Pre-fill the sliding window with the first n-1 items.
    history = []
    while n > 1:
        try:
            next_item = next(sequence)
        except StopIteration:
            # Fewer than n-1 items: no ngram exists. Returning here matters
            # because a StopIteration escaping a generator body is turned
            # into RuntimeError (PEP 479).
            return
        history.append(next_item)
        n -= 1
    # Slide the window: append one item, emit the ngram, drop the oldest.
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]

# NOTE(review): this assignment replaces the earlier `__all__ = ["ngrams"]`
# from the same cell, so only the two distinct-N functions remain listed.
__all__ = ["distinct_n_sentence_level", "distinct_n_corpus_level"]


def distinct_n_sentence_level(sentence, n):
    """
    Compute distinct-N for a single sentence.

    The value is the number of distinct n-grams in ``sentence`` divided by
    the number of items in ``sentence``.

    :param sentence: a list of words.
    :param n: int, ngram.
    :return: float, the metric value; 0.0 when the sentence is empty or too
        short to contain any n-gram.
    """
    # The requested order ``n`` is honoured as passed by the caller.
    if len(sentence) == 0:
        return 0.0  # Prevent a zero division
    if len(sentence) < n:
        return 0.0  # No n-gram fits, so the count of distinct n-grams is 0
    distinct_ngrams = set(ngrams(sentence, n))
    return len(distinct_ngrams) / len(sentence)


def distinct_n_corpus_level(sentences, n):
    """
    Compute average distinct-N of a list of sentences (the corpus).

    :param sentences: a list (or other iterable) of sentences, each a list
        of words.
    :param n: int, ngram.
    :return: float, the average of the sentence-level values; 0.0 for an
        empty corpus.
    """
    # Materialize once so that generators are accepted and len() is defined.
    sentences = list(sentences)
    if not sentences:
        return 0.0  # Prevent a zero division, as the sentence-level metric does
    return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences)

In [None]:
# Human text from real-life leaders
df = pd.read_csv('/projects/humansVsLLMs/data/goals_leader_with_demographics.csv')
# distinct-N expects a list of words per sentence: a raw string would be
# measured character by character, and a missing (NaN) cell has no len().
# NOTE(review): assumes the column holds free text for which whitespace
# splitting is an adequate tokenization — confirm.
tokenized_plans = df['Leader_Action_Plans'].dropna().astype(str).str.split()
# NOTE(review): this cell requests n=2 while the model-generated cell
# requests n=1 — confirm the intended order so the numbers are comparable.
sentence_scores = tokenized_plans.apply(lambda tokens: distinct_n_sentence_level(tokens, 2))
# Use the corpus-level helper for the corpus figure; skip it when every row
# was missing so an empty corpus cannot cause a division by zero.
corpus_scores = (
    distinct_n_corpus_level(tokenized_plans.tolist(), 2)
    if len(tokenized_plans) else float('nan')
)
print(f'distinct_n_sentence_level: {sentence_scores.mean()} | corpus_scores: {corpus_scores}')

In [None]:
# Model-generated texts
path = '/projects/humansVsLLMs/results/3-shot-generated-responses'
models = ['cohere', 'deepseek', 'gemini', 'gpt-4o-mini', 'llama', 'mistral', 'qwen']
# The loop variable is deliberately not called `model`, so that it cannot
# rebind a module-level `model` object used by other cells of this notebook.
for model_name in models:
    df = pd.read_csv(f'{path}/{model_name}_generated_responses.csv')
    # distinct-N expects a list of words per sentence: a raw string would be
    # measured character by character, and a missing (NaN) cell has no len().
    # NOTE(review): assumes whitespace splitting is an adequate tokenization
    # for the responses — confirm.
    tokenized_responses = df['Response'].dropna().astype(str).str.split()
    sentence_scores = tokenized_responses.apply(lambda tokens: distinct_n_sentence_level(tokens, 1))
    # Use the corpus-level helper for the corpus figure; skip it when every
    # row was missing so an empty corpus cannot cause a division by zero.
    corpus_scores = (
        distinct_n_corpus_level(tokenized_responses.tolist(), 1)
        if len(tokenized_responses) else float('nan')
    )
    # Prefix each line with the model name so the output lines can be told apart.
    print(f'{model_name} | distinct_n_sentence_level: {sentence_scores.mean()} | corpus_scores: {corpus_scores}')