In [9]:
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from collections import Counter
import regex as re
from nltk.sentiment import SentimentIntensityAnalyzer

In [8]:
#Download necessary NLTK data
#nltk.download()
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng',quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab',quiet=True)
nltk.download('vader_lexicon',quiet=True)

True

In [10]:
# Emotion class definition
class Emotion:
    emotions = {}
    
    def __init__(self, name, parent_name=None):
        self.name = name
        self.parent = None
        self.level = 0
        self.children = []
        
        if parent_name:
            self.parent = Emotion.emotions[parent_name] if parent_name else None
            self.parent.children.append(self)
            self.level = self.parent.level + 1
    
    def __str__(self):
        return self.name

# Simplified WNAffect class
class WNAffect:
    def __init__(self):
        self.emotions = self._load_emotions()

    def _load_emotions(self):
        emotions = {
            "joy": Emotion("joy"),
            "sadness": Emotion("sadness"),
            "anger": Emotion("anger"),
            "fear": Emotion("fear"),
            "surprise": Emotion("surprise"),
            "disgust": Emotion("disgust")
        }
        return emotions

    def get_emotion(self, word, pos):
        synsets = wn.synsets(word)
        if synsets:
            for emotion, emotion_obj in self.emotions.items():
                if any(emotion in s.definition() for s in synsets):
                    return emotion_obj
        return None

In [11]:
# Define the file path
csv_file_path = r"data/Expert Human Evaluation - Sheet1.csv"

# Check if the CSV file exists
if not os.path.exists(csv_file_path):
    print(f"Error: The file {csv_file_path} does not exist.")
    exit()

# Initialize WNAffect
wna = WNAffect()

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Rename columns for clarity
df.columns = ['Group', 'Sender', 'Message'] + list(df.columns[3:])

# Create a new column for combined groups
df['Combined_Group'] = df['Group'].map({'A': 'AB', 'B': 'AB', 'C': 'CD', 'D': 'CD', 'FAQ': 'FAQ'})

def analyze_emotions(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    
    emotions = []
    for word, pos in pos_tags:
        emotion = wna.get_emotion(word, pos)
        if emotion:
            emotions.append(emotion)
    
    emotion_count = Counter(str(e) for e in emotions)
    total_emotions = sum(emotion_count.values())
    
    return {
        'emotion_count': dict(emotion_count),
        'total_emotions': total_emotions,
        'emotion_diversity': len(emotion_count) / total_emotions if total_emotions > 0 else 0,
        'top_emotions': emotion_count.most_common(3)
    }

# Apply emotion analysis to each message
df['emotion_analysis'] = df['Message'].apply(analyze_emotions)

# Function to calculate group statistics
def group_statistics(group):
    stats = {}
    
    # Aggregate emotion counts
    all_emotions = Counter()
    for item in group['emotion_analysis']:
        all_emotions.update(item['emotion_count'])
    
    stats['total_emotions'] = sum(all_emotions.values())
    stats['unique_emotions'] = len(all_emotions)
    stats['top_emotions'] = all_emotions.most_common(5)
    stats['emotion_diversity'] = len(all_emotions) / stats['total_emotions'] if stats['total_emotions'] > 0 else 0
    
    # Average emotion counts per message
    stats['avg_emotions_per_message'] = stats['total_emotions'] / len(group)
    
    return stats

In [12]:
# Calculate statistics for each combined group
group_stats = df.groupby('Combined_Group').apply(group_statistics)
# Print results
for group, stats in group_stats.items():
    print(f"\nStatistics for Group {group}:")
    print(f"  Total emotions: {stats['total_emotions']}")
    print(f"  Unique emotions: {stats['unique_emotions']}")
    print(f"  Emotion diversity: {stats['emotion_diversity']:.4f}")
    print(f"  Average emotions per message: {stats['avg_emotions_per_message']:.2f}")
    print("  Top 5 emotions:")
    for emotion, count in stats['top_emotions']:
        print(f"    {emotion}: {count}")



Statistics for Group AB:
  Total emotions: 62
  Unique emotions: 3
  Emotion diversity: 0.0484
  Average emotions per message: 0.94
  Top 5 emotions:
    anger: 36
    joy: 22
    surprise: 4

Statistics for Group CD:
  Total emotions: 32
  Unique emotions: 3
  Emotion diversity: 0.0938
  Average emotions per message: 0.49
  Top 5 emotions:
    anger: 28
    surprise: 2
    joy: 2

Statistics for Group FAQ:
  Total emotions: 26
  Unique emotions: 3
  Emotion diversity: 0.1154
  Average emotions per message: 0.72
  Top 5 emotions:
    anger: 23
    joy: 2
    surprise: 1


  group_stats = df.groupby('Combined_Group').apply(group_statistics)


In [13]:
# Optional: Save results to CSV
result_df = pd.DataFrame.from_dict(group_stats.to_dict(), orient='index')
result_df.to_csv('results/emotion_analysis_results_combined.csv')

In [14]:
####Linguistic rules/structure based on theory of linguistic empathy (syntax and rhetoric)
## Empathy Rules
# Person form
def score_person_form(text):
    first_person_plural = len([word for word in text.split() if word.lower() in ["we", "us", "our"]])
    second_person = len([word for word in text.split() if word.lower() in ["you", "your"]])
    return first_person_plural + second_person

# Pronouns
def score_pronouns(text):
    tokens = word_tokenize(text)
    pronouns = [word for word, pos in pos_tag(tokens) if pos == 'PRP']
    return len(pronouns)

# Tense
def score_tense(text):
    tokens = word_tokenize(text)
    present_tense_verbs = len([word for word, pos in pos_tag(tokens) if pos in ['VBP', 'VBZ']])
    return present_tense_verbs

# Exclamations
def score_exclamations(text):
    exclamations = text.count('!')
    return exclamations

# Stimulating Dialogue
def score_stimulating_dialogue(text):
    stimulating_phrases = [
        r"\bshall we\b", r"\bhow about\b", r"could you please share",
        r"what are your thoughts on\b", r"\bwhat do you think about\b",
        r"\bwhy don't we\b", r"\bhave you considered\b"
    ]
    # Use regex to find matches and count them
    return sum(len(re.findall(phrase, text.lower())) for phrase in stimulating_phrases)


def score_acknowledging(text):
    acknowledging_phrases = [
        r"\bthank you for\b", r"\bthis is helpful\b", r"\bI appreciate\b",
        r"\bgood point\b", r"\bthat's a great idea\b", r"\bI understand\b",
        r"\bthanks for sharing\b"
    ]
    # Use regex to find matches and count them
    return sum(len(re.findall(phrase, text.lower())) for phrase in acknowledging_phrases)


def score_collective_reasoning(text):
    reasoning_phrases = [
        r"\bthinking together\b", r"\blet us think this through\b",
        r"\bas a team\b", r"\bworking together\b", r"\bjoin our heads\b",
        r"\bcollectively consider\b", r"\bmutual understanding\b"
    ]
    # Use regex to find matches and count them
    return sum(len(re.findall(phrase, text.lower())) for phrase in reasoning_phrases)

# Imperative Statements
def score_imperative_statements(text):
    tokens = word_tokenize(text)
    imperatives = [word for word, pos in pos_tag(tokens) if pos == 'VB' or (pos == 'VBP' and word == 'please')]
    return len(imperatives)

# Interim Questioning
def score_interim_questioning(text):
    interim_questions = text.count('?')
    return interim_questions

# Caring Statements
sia = SentimentIntensityAnalyzer()
def score_caring_statements(text):
    # This uses Sentiment Intensity Analyzer to check for positive sentiment as a proxy for affective statements
    sentiment = sia.polarity_scores(text)
    return sentiment['pos']



In [102]:
#Rule #1: 
df['person_form'] = df['Message'].apply(score_person_form)
df.groupby('Combined_Group')['person_form'].describe()



Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,2.893939,1.772688,0.0,2.0,3.0,4.0,9.0
CD,65.0,0.184615,0.527148,0.0,0.0,0.0,0.0,2.0
FAQ,36.0,0.111111,0.464621,0.0,0.0,0.0,0.0,2.0


In [103]:
#Rule #2: 
df['pronouns'] = df['Message'].apply(score_pronouns)
df.groupby('Combined_Group')['pronouns'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,3.287879,2.058866,0.0,2.0,3.0,4.0,12.0
CD,65.0,0.569231,1.198557,0.0,0.0,0.0,1.0,6.0
FAQ,36.0,0.388889,1.049565,0.0,0.0,0.0,0.25,6.0


In [50]:
df['tense'] = df['Message'].apply(score_pronouns)
df.groupby('Combined_Group')['tense'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,3.287879,2.058866,0.0,2.0,3.0,4.0,12.0
CD,65.0,0.569231,1.198557,0.0,0.0,0.0,1.0,6.0
FAQ,36.0,0.388889,1.049565,0.0,0.0,0.0,0.25,6.0


In [51]:
df['exclamations'] = df['Message'].apply(score_pronouns)
df.groupby('Combined_Group')['exclamations'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,3.287879,2.058866,0.0,2.0,3.0,4.0,12.0
CD,65.0,0.569231,1.198557,0.0,0.0,0.0,1.0,6.0
FAQ,36.0,0.388889,1.049565,0.0,0.0,0.0,0.25,6.0


In [96]:
df['simulations'] = df['Message'].apply(score_stimulating_dialogue)
df.groupby('Combined_Group')['simulations'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,0.060606,0.240435,0.0,0.0,0.0,0.0,1.0
CD,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAQ,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
df['acknowledge'] = df['Message'].apply(score_acknowledging)
df.groupby('Combined_Group')['acknowledge'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,0.166667,0.375534,0.0,0.0,0.0,0.0,1.0
CD,65.0,0.030769,0.174036,0.0,0.0,0.0,0.0,1.0
FAQ,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
df['collective reasoning'] = df['Message'].apply(score_collective_reasoning)
df.groupby('Combined_Group')['collective reasoning'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CD,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAQ,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
df['imperative'] = df['Message'].apply(score_imperative_statements)
df.groupby('Combined_Group')['imperative'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,3.212121,2.317261,0.0,2.0,3.0,5.0,9.0
CD,65.0,1.553846,1.50032,0.0,0.0,1.0,2.0,5.0
FAQ,36.0,1.305556,1.260826,0.0,0.0,1.0,2.0,6.0


In [90]:
df['interim questioning'] = df['Message'].apply(score_interim_questioning)
df.groupby('Combined_Group')['interim questioning'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,0.530303,0.502905,0.0,0.0,1.0,1.0,1.0
CD,65.0,0.523077,0.562019,0.0,0.0,0.0,1.0,2.0
FAQ,36.0,0.194444,0.401386,0.0,0.0,0.0,0.0,1.0


In [16]:
df['caring'] = df['Message'].apply(score_caring_statements)
df.groupby('Combined_Group')['caring'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Combined_Group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AB,66.0,0.218939,0.125743,0.0,0.108,0.1995,0.32075,0.516
CD,65.0,0.140615,0.116219,0.0,0.0,0.156,0.215,0.385
FAQ,36.0,0.078556,0.102656,0.0,0.0,0.055,0.12025,0.444
