In [3]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import math

In [5]:
# Load the responses analysed by the Marked Persona cells below.
# NOTE(review): marked_words reads the 'Race' and 'Response' columns of this frame — confirm the CSV provides both.
df = pd.read_csv('Data_Post_Processing/llm_responses_all.csv')


# Marked Persona

In [None]:
def _token_counts(texts, lower=True):
    """Count whitespace-separated tokens in a Series of strings.

    Every character that is not an ASCII letter (or whitespace) is stripped
    from each token. Tokens that end up empty (pure punctuation or digits)
    are discarded so that '' is never counted as a word.

    Returns a defaultdict(int) mapping token -> count.
    """
    if lower:
        texts = texts.str.lower()
    tokens = (
        texts.str.split()
        .explode()   # one token per row; empty or missing texts yield NaN
        .dropna()
        .replace(r'[^a-zA-Z\s]', '', regex=True)
    )
    counts = tokens.value_counts().drop('', errors='ignore')
    return defaultdict(int, ((token, int(count)) for token, count in counts.items()))


def get_log_odds(df1, df2, df0, verbose=False, lower=True, top_n=20):
    """Calculate log-odds ratio for words in df1 (target group) vs df2 (non-target group).

    For every word, the log-odds of the word in df1 minus its log-odds in df2
    is computed, with the word counts of df0 added to both groups as a prior,
    and divided by the estimated standard deviation, which makes the result
    a z-score.

    Parameters:
    - df1: Series of texts for the target group.
    - df2: Series of texts for the non-target group.
    - df0: Series of texts used to build the prior counts.
    - verbose: If True, print the top_n most positive and most negative words.
    - lower: If True, lowercase the texts before tokenizing.
    - top_n: Number of words printed per direction when verbose is True.

    Returns:
    - A defaultdict(float) mapping word -> z-score; positive values mark words
      that are more characteristic of df1, negative values of df2.
    """
    # Tokenize and count words
    counts1 = _token_counts(df1, lower)
    counts2 = _token_counts(df2, lower)
    prior = _token_counts(df0, lower)

    # Give both groups the same vocabulary and make sure every word seen in
    # either group has a prior of at least 1.
    for word in [*counts2, *counts1]:
        counts1.setdefault(word, 0)
        counts2.setdefault(word, 0)
        if prior[word] == 0:
            prior[word] = 1

    n1 = sum(counts1.values())
    n2 = sum(counts2.values())
    nprior = sum(prior.values())

    # Calculate log-odds ratio
    delta = defaultdict(float)
    for word, prior_count in prior.items():
        if prior_count <= 0:
            continue
        with_prior1 = counts1.get(word, 0) + prior_count
        with_prior2 = counts2.get(word, 0) + prior_count
        rest1 = (n1 + nprior) - with_prior1
        rest2 = (n2 + nprior) - with_prior2
        if rest1 == 0 or rest2 == 0:
            # The whole vocabulary is this single word: the odds are undefined.
            continue
        sigma = math.sqrt(1 / with_prior1 + 1 / with_prior2)
        log_odds_diff = math.log(with_prior1 / rest1) - math.log(with_prior2 / rest2)
        delta[word] = log_odds_diff / sigma

    if verbose:
        print(f"Top {top_n} words with highest log-odds (distinctive for target group):")
        for word in sorted(delta, key=delta.get, reverse=True)[:top_n]:
            print(f"{word}: {delta[word]:.3f}")

        print(f"\nTop {top_n} words with lowest log-odds (distinctive for non-target group):")
        for word in sorted(delta, key=delta.get)[:top_n]:
            print(f"{word}: {delta[word]:.3f}")

    return delta

In [21]:
def marked_words(df, target_race, non_target_race="White", verbose=False):
    """Return the words whose usage differs significantly between two race groups.

    Rows of `df` are split on the 'Race' column; the 'Response' texts of the
    target group are compared with those of the non-target group, using all
    responses in `df` as the prior. Only words with |z-score| > 1.96 are kept.

    Returns a dict mapping word -> z-score.
    """
    race = df['Race']
    all_responses = df['Response']

    # Responses of the two groups being contrasted
    target_responses = df.loc[race == target_race, 'Response']
    reference_responses = df.loc[race == non_target_race, 'Response']

    z_scores = get_log_odds(target_responses, reference_responses, all_responses, verbose=verbose)

    # Keep only the words beyond the two-sided 1.96 threshold
    significant_words = {}
    for word, score in z_scores.items():
        if abs(score) > 1.96:
            significant_words[word] = score
    return significant_words

In [None]:
# Define target races
# Each of these groups is compared against marked_words' default
# non_target_race ("White").
target_races = [
    'African American or Black',
    'Asian',
    'Native Hawaiian or Other Pacific Islander',
    'American Indian or Alaska Native',
    'Hispanic or Latino'
]

# Calculate log-odds for each target group
# `results` maps race -> {word: z-score} of the significant words; verbose=True
# additionally prints the most distinctive words for every comparison.
# NOTE(review): this uses whichever `df` is currently bound; later cells rebind `df`.
results = {}
for race in target_races:
    print(f"Calculating log-odds for {race}...")
    results[race] = marked_words(df, target_race=race, verbose=True)
    print("\n")

# Semantic Surprisal Rate

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Load a pre-trained model for semantic similarity
#model = SentenceTransformer('all-MiniLM-L6-v2')
# `model` is a module-level global; calculate_surprisal reads it at call time.
model = SentenceTransformer('all-mpnet-base-v2')

# Rebinds `df` (previously the frame loaded for the Marked Persona cells).
df = pd.read_csv('Data_Post_Processing/Participants_Response_Clean.csv')

def calculate_surprisal(text):
    """Score how much consecutive sentences of `text` differ semantically.

    The text is split into sentences, each sentence is embedded with the
    module-level `model`, and the cosine distance (1 - cosine similarity)
    between every sentence and its predecessor is computed.

    Returns:
    - 0 when the text has fewer than two sentences.
    - Otherwise twice the mean consecutive distance, i.e.
      (2 / (num_sentences - 1)) * sum(distances). This lies in [0, 2] as long
      as the similarities are non-negative; since cosine distance itself can
      reach 2, the theoretical upper bound is 4.
    """
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    num_fragments = len(sentences)

    if num_fragments < 2:
        return 0  # No surprisal if there's only one sentence

    # Compute sentence embeddings
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Semantic distance between each sentence and the one before it
    distances = [
        1 - util.pytorch_cos_sim(embeddings[i], embeddings[i - 1]).item()
        for i in range(1, num_fragments)
    ]

    # Twice the average consecutive distance
    return (2 / (num_fragments - 1)) * np.sum(distances)

def calculate_surprisal_for_dataframe(df):
    """Add a 'Surprisal' column computed from each row's 'Response' text.

    The frame passed in is modified in place and also returned.
    """
    surprisal_per_response = df['Response'].apply(calculate_surprisal)
    df['Surprisal'] = surprisal_per_response
    return df



# Calculate surprisal for each response
# (adds a 'Surprisal' column to `df` in place; the returned frame is the same object)
df = calculate_surprisal_for_dataframe(df)

# Group by race and calculate average surprisal for each race group
# reset_index() turns the result into a two-column frame: 'Race', 'Surprisal'
race_surprisal = df.groupby('Race')['Surprisal'].mean().reset_index()

# Print the results
print("Average Surprisal by Race:")
print(race_surprisal)

# Semantic Diversity

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

# Load the sentence embedding model
# Rebinds the module-level `model`: any function that reads `model` at call time
# (e.g. calculate_surprisal, semantic_distance) uses this model from here on.
model = SentenceTransformer('thenlper/gte-large')

def semantic_distance(texts):
    """
    Return the pairwise semantic distance matrix for a list of texts.

    The texts are embedded with the module-level `model`; entry (i, j) of the
    result is 1 minus the cosine similarity of texts i and j.
    """
    text_vectors = model.encode(texts)
    return 1 - cosine_similarity(text_vectors)

def inverse_homogenization_score(responses, sample_size=None):
    """
    Return the mean pairwise semantic distance of a set of responses.

    If `sample_size` is given and smaller than the number of responses, that
    many responses are drawn without replacement (via np.random.choice) before
    scoring. Returns 0 when fewer than two responses are available.
    """
    needs_sampling = bool(sample_size) and len(responses) > sample_size
    if needs_sampling:
        responses = np.random.choice(responses, size=sample_size, replace=False)

    n = len(responses)
    if n <= 1:
        return 0  # A single response (or none) has no pairwise distances

    # Number of unordered pairs among n responses
    pair_count = n * (n - 1) / 2

    # Keep each pair once: strictly-upper triangle of the distance matrix
    pairwise_once = np.triu(semantic_distance(responses), k=1)
    return np.sum(pairwise_once) / pair_count

def calculate_semantic_diversity_by_race(df, race_column, response_column, sample_size=None):
    """
    Score the semantic diversity of the responses of every race group.

    Parameters:
    - df: DataFrame containing the data.
    - race_column: Name of the column containing race information.
    - response_column: Name of the column containing responses.
    - sample_size: Optional. Passed on to inverse_homogenization_score to cap
      the number of responses scored per group.

    Returns:
    - A dictionary mapping each distinct value of `race_column` (in order of
      first appearance) to its semantic diversity score.
    """
    def score_group(race):
        # Responses of the rows belonging to this group, as a plain list
        in_group = df[race_column] == race
        group_responses = df[in_group][response_column].tolist()
        return inverse_homogenization_score(group_responses, sample_size)

    return {race: score_group(race) for race in df[race_column].unique()}


# Load the data

df = pd.read_csv('Data_Post_Processing/Participants_Response_Clean.csv')


# Calculate semantic diversity for each race group
# (sample_size is left at its default, so every response of a group is scored)
diversity_scores = calculate_semantic_diversity_by_race(df, race_column='Race', response_column='Response')

# Print the results
# NOTE(review): the printed value is the score multiplied by 10, not the raw
# score stored in `diversity_scores` — confirm the scaling is intended.
for race, score in diversity_scores.items():
    print(f"Semantic diversity for {race}: {score*10:.4f}")

In [None]:
def calculate_semantic_diversity_whole_corpus(df, response_column, sample_size=None):
    """
    Score the semantic diversity of all responses taken together.

    Parameters:
    - df: DataFrame containing the data.
    - response_column: Name of the column containing responses.
    - sample_size: Optional. Passed on to inverse_homogenization_score to cap
      the number of responses scored.

    Returns:
    - Semantic diversity score for the whole corpus.
    """
    corpus = df[response_column].tolist()
    return inverse_homogenization_score(corpus, sample_size)
    
# Score all rows of `df` together (no sampling); as the last expression of the
# cell, the value is shown as the cell output.
calculate_semantic_diversity_whole_corpus(df, response_column='Response')

In [10]:
# index=False: this file is re-read with pd.read_csv in later cells, and a
# written index would come back as an extra 'Unnamed: 0' column on every
# save/load round trip.
df.to_csv('Data_Post_Processing/Participants_Response_Clean.csv', index=False)

# Novelty

In [None]:
# Reload the participant responses saved by the previous cell.
# NOTE(review): `pd` is used here before this cell's own `import pandas as pd`;
# it works only because pandas was already imported by an earlier cell.
df = pd.read_csv('Data_Post_Processing/Participants_Response_Clean.csv')

#Semantic Novelty
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations

# Load SBERT model
# (rebinds the module-level `model` that functions from earlier cells read at call time)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all responses in one batched call instead of one encode() call per
# row; each row of "Embedding" still holds that response's 1-D vector.
# Batched encoding may differ from per-row encoding in the last decimals.
df["Embedding"] = list(model.encode(df["Response"].tolist()))

# Function to compute average pairwise distance
def avg_semantic_distance(embeddings):
    """Return the mean cosine distance over all unordered pairs of embeddings.

    Parameters:
    - embeddings: list of equal-length 1-D vectors.

    Returns:
    - 0.0 when fewer than two embeddings are given, otherwise the mean of
      (1 - cosine similarity) over every pair.

    All pairs are computed with a single normalised matrix product rather
    than one similarity call per pair. A zero vector is treated as having
    similarity 0 (distance 1) to every other vector.
    """
    if len(embeddings) < 2:
        return 0.0  # Not enough stories to compare

    matrix = np.asarray(embeddings, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid 0/0 for all-zero vectors
    unit_vectors = matrix / norms

    similarity = unit_vectors @ unit_vectors.T
    # Strictly-upper triangle: each unordered pair exactly once
    pair_rows, pair_cols = np.triu_indices(len(matrix), k=1)
    return np.mean(1 - similarity[pair_rows, pair_cols])

# Calculate average distance for each race group
# (the embeddings of each group are collected into a list, then scored)
race_groups = df.groupby("Race")["Embedding"].apply(list)
race_distances = {race: avg_semantic_distance(embeddings) for race, embeddings in race_groups.items()}

# Calculate average distance for the entire corpus
corpus_distance = avg_semantic_distance(df["Embedding"].tolist())

# Compute novelty for each race group: twice the absolute gap between the
# group's average pairwise distance and the corpus-wide average
novelty_scores = {race: 2 * abs(distance - corpus_distance) for race, distance in race_distances.items()}

print("Semantic Novelty by Race Group:")
for race, novelty in novelty_scores.items():
    print(f"Race {race}: {novelty:.3f}")

In [13]:
# index=False: this file is re-read with pd.read_csv in later cells, and a
# written index would come back as an extra 'Unnamed: 0' column on every
# save/load round trip.
# NOTE(review): if the "Embedding" column is still on `df`, its arrays are
# written as text — confirm it is meant to be persisted.
df.to_csv('Data_Post_Processing/Participants_Response_Clean.csv', index=False)

# Complexity

In [14]:
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from nltk.tokenize import word_tokenize
import nltk

# Reload the participant responses for the complexity metrics below.
df = pd.read_csv('Data_Post_Processing/Participants_Response_Clean.csv')


In [None]:
# TF-IDF semantic complexity
from sklearn.feature_extraction.text import TfidfVectorizer

# Compute TF-IDF for the entire corpus
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df["Response"])

# Average TF-IDF score per response: mean of each row of X across all
# vocabulary columns (terms absent from a response contribute 0).
# NOTE(review): presumably higher means more rare terms — confirm this is the
# intended complexity measure.
df["TFIDF_Complexity"] = np.array(X.mean(axis=1)).flatten()

# Group by race and average
tfidf_complexity = df.groupby("Race")["TFIDF_Complexity"].mean()
print("Semantic Complexity (TF-IDF):\n", tfidf_complexity)

In [None]:
#Word2Vec Semantic Complexity

# Preprocess text
def preprocess(text):
    """Tokenize `text` and return its purely alphabetic tokens, lowercased."""
    cleaned_tokens = []
    for token in word_tokenize(text):
        if token.isalpha():
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

# Train Word2Vec on the corpus
sentences = [preprocess(text) for text in df["Response"]]
# workers=1 with a fixed seed: multi-threaded training orders updates
# differently on every run, so the vectors (and the complexity scores saved
# from them) would not be reproducible. seed=1 is gensim's default, made explicit.
# NOTE: this rebinds the module-level `model`; semantic_spread reads `model.wv`.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=1, seed=1)

# Function to compute semantic spread
def semantic_spread(text):
    """Return the mean pairwise cosine distance between the word vectors of `text`.

    Words are taken from preprocess(text) and looked up in the module-level
    Word2Vec `model`; words without a vector are skipped. Returns 0.0 when
    fewer than two vectors are found.
    """
    vectors = [model.wv[token] for token in preprocess(text) if token in model.wv]
    vector_count = len(vectors)
    if vector_count < 2:
        return 0.0

    # Distance for every unordered pair of word vectors
    distances = []
    for first in range(vector_count):
        for second in range(first + 1, vector_count):
            distances.append(cosine(vectors[first], vectors[second]))

    if not distances:
        return 0.0
    return np.mean(distances)

# Compute for each response
# (semantic_spread is evaluated on every pair of words in a response, so the
# cost grows quadratically with response length)
df["Word2Vec_Complexity"] = df["Response"].apply(semantic_spread)

# Group by race
w2v_complexity = df.groupby("Race")["Word2Vec_Complexity"].mean()
print("Semantic Complexity (Word2Vec Spread):\n", w2v_complexity)

In [None]:
# Combined score: each metric is divided by its column maximum, then the two
# are averaged with equal weight (0.5 each).
# NOTE(review): a column maximum of 0 would make the division produce NaN/inf — confirm this cannot occur.
df["Combined_Complexity"] = 0.5 * (df["TFIDF_Complexity"] / df["TFIDF_Complexity"].max()) + \
                           0.5 * (df["Word2Vec_Complexity"] / df["Word2Vec_Complexity"].max())

# Average the combined score per race group
group_complexity = df.groupby("Race")["Combined_Complexity"].mean()
print("Combined Semantic Complexity:\n", group_complexity)

In [20]:
# index=False: this file is re-read with pd.read_csv elsewhere in the notebook,
# and a written index would come back as an extra 'Unnamed: 0' column on every
# save/load round trip.
df.to_csv('Data_Post_Processing/Participants_Response_Clean.csv', index=False)