In [None]:
from emoatlas import EmoScores
import os
import json
import re
from pprint import pprint
from collections import defaultdict
from emoatlas.resources import _valences
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats



# Single EmoScores instance shared by the cells below, which call its
# combine_edgelists method to merge the per-line edge lists of a file.
emos=EmoScores()

In [None]:
# Root directory that is joined with a folder name to locate the input files.
base_output_dir = "localdb"
# Root directory that is joined with a folder name to locate the exported metrics.
valence_dir = "Valence_Metrics"

# Candidate folders. Kept under its own name so that narrowing the selection
# below no longer relies on overwriting a previous assignment.
all_folders = ['climate', 'math', 'misinformation_health', 'gwarming']

# Folders of interest: the subset the cells below iterate over.
# Use `folders_of_interest = list(all_folders)` to process every candidate.
folders_of_interest = ['math']

In [None]:
def extract_model(filename):
    """Return the text enclosed by the first pair of underscores in ``filename``.

    The pattern is non-greedy, so the capture stops at the first underscore
    that follows the opening one (the captured text may be empty).

    Returns:
        The captured string, or None when the pattern does not match.
    """
    found = re.search(r'_(.*?)_', filename)
    return found.group(1) if found else None

In [None]:
def get_valence_sets(weighted_fmnt, language='english'):
    """Split the nodes of an edge list into positive, negative and neutral sets.

    Args:
        weighted_fmnt: iterable of edges; the first two items of every edge
            are taken as its node labels.
        language: passed through to ``_valences`` to select the word lists.

    Returns:
        A tuple ``(positive_nodes, negative_nodes, neutral_nodes)``. A node is
        neutral when it is in neither of the first two sets. The third
        collection returned by ``_valences`` is not consulted.
    """
    positive_words, negative_words, _ambivalent_words = _valences(language)

    # Every distinct endpoint that appears in the network.
    nodes = {endpoint for edge in weighted_fmnt for endpoint in edge[:2]}

    positive_nodes = nodes.intersection(positive_words)
    negative_nodes = nodes.intersection(negative_words)
    neutral_nodes = nodes.difference(positive_nodes, negative_nodes)

    return positive_nodes, negative_nodes, neutral_nodes

In [None]:
def swap_elements_in_sets(set1, set2, set3, swaps_per_element=10, rng=None):
    """Shuffle elements between three sets while preserving each set's size.

    Repeatedly picks two of the three sets and one random element from each,
    then trades the two elements. Because every trade is one-for-one, the
    returned sets have the same sizes as the inputs and together hold the
    same elements. The inputs are not modified.

    Args:
        set1, set2, set3: the collections to shuffle (treated as sets).
        swaps_per_element: number of attempted trades per element; the total
            number of attempts is ``swaps_per_element * (len of all three)``.
        rng: optional object exposing ``sample`` and ``randrange`` (for
            example ``random.Random(seed)``) to make the shuffle reproducible.
            Defaults to the module-level ``random`` generator.

    Returns:
        A tuple of three new sets, in the same order as the arguments.
    """
    rng = random if rng is None else rng

    # Set copies give O(1) membership tests; the parallel lists give O(1)
    # random access, so one attempted trade no longer scans a whole list.
    members = [set(set1), set(set2), set(set3)]
    pools = [list(group) for group in members]
    total_elements = sum(len(pool) for pool in pools)

    for _ in range(swaps_per_element * total_elements):
        # Pick 2 distinct sets.
        first, second = rng.sample(range(3), 2)

        # An empty set has nothing to trade. Picking from it used to raise
        # IndexError, e.g. for a network without any negative word.
        if not pools[first] or not pools[second]:
            continue

        pos1 = rng.randrange(len(pools[first]))
        pos2 = rng.randrange(len(pools[second]))
        word1, word2 = pools[first][pos1], pools[second][pos2]

        # Only trade if neither word already exists in the other set,
        # otherwise the trade would change the set sizes.
        if word1 in members[second] or word2 in members[first]:
            continue

        pools[first][pos1], pools[second][pos2] = word2, word1
        members[first].remove(word1)
        members[first].add(word2)
        members[second].remove(word2)
        members[second].add(word1)

    return members[0], members[1], members[2]

In [None]:
# Sanity check: for one file per folder, shuffle the valence labels once and
# count the edges that join positive/negative nodes under the shuffled labels.
for folder in folders_of_interest:
    input_folder_path = os.path.join(base_output_dir, folder)

    for filename in os.listdir(input_folder_path):
        # str.rstrip('.jsonl') strips any trailing run of the characters
        # '.', 'j', 's', 'o', 'n', 'l' (a name ending in "emotion.jsonl" would
        # become "...emoti"), so drop the exact suffix instead.
        stem = filename[:-len('.jsonl')] if filename.endswith('.jsonl') else filename
        print(stem)

        file_path = os.path.join(input_folder_path, filename)
        # Read as UTF-8 explicitly so non-ASCII words do not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            fmnts = [json.loads(line)['fmnt']['syntactic'] for line in file]

        weighted_fmnt = emos.combine_edgelists(fmnts)
        language = 'italian' if 'ITA' in filename else 'english'

        positive, negative, neutral = get_valence_sets(weighted_fmnt, language=language)
        random_positive, random_negative, random_neutral = swap_elements_in_sets(positive, negative, neutral)
        print(len(positive),len(negative),len(neutral))
        print(len(random_positive),len(random_negative),len(random_neutral))
        print(positive)
        print(random_positive)

        def get_random_valence(word):
            """Valence label of ``word`` under the shuffled assignment."""
            if word in random_positive:
                return 'positive'
            elif word in random_negative:
                return 'negative'
            else:
                return 'neutral'

        # Edge counts (not weight sums) per valence pairing; pairings that
        # involve a neutral node are ignored in this check.
        random_weights = {
            'pos_pos': 0,
            'neg_neg': 0, 'pos_neg': 0,
        }

        for node1, node2, _weight in weighted_fmnt:
            valence_pair = {get_random_valence(node1), get_random_valence(node2)}

            if valence_pair == {'positive'}:
                random_weights['pos_pos'] += 1
            elif valence_pair == {'negative'}:
                random_weights['neg_neg'] += 1
            elif valence_pair == {'positive', 'negative'}:
                random_weights['pos_neg'] += 1

        print(random_weights)
        # Only the first directory entry of each folder is inspected.
        break

In [None]:
def export_loop_valences():
    """Export shuffled-valence edge statistics for every input file.

    For each file in each folder of ``folders_of_interest`` the per-line
    syntactic edge lists are merged into one weighted network. The valence
    labels of its nodes are then shuffled 1000 times with
    ``swap_elements_in_sets``; for every shuffle one JSON line is appended to
    ``<valence_dir>/<folder>/<file stem>_valence.jsonl`` holding, per valence
    pairing, the number of edges and the summed edge weight.

    The output file is opened in append mode, so running the export again
    adds further lines to existing files.
    """
    suffix = '.jsonl'

    # Counter names in the order they are written to each JSON line.
    pairings = ('pos_pos', 'pos_neutral', 'neg_neutral',
                'neg_neg', 'neutral_neutral', 'pos_neg')
    counter_names = pairings + tuple('weight_' + name for name in pairings)

    # Unordered pair of valence labels -> counter name.
    pairing_of = {
        frozenset(['positive']): 'pos_pos',
        frozenset(['positive', 'neutral']): 'pos_neutral',
        frozenset(['negative', 'neutral']): 'neg_neutral',
        frozenset(['negative']): 'neg_neg',
        frozenset(['neutral']): 'neutral_neutral',
        frozenset(['positive', 'negative']): 'pos_neg',
    }

    def valence_label(word, positive_words, negative_words):
        """Label ``word`` as positive, negative or (by default) neutral."""
        if word in positive_words:
            return 'positive'
        if word in negative_words:
            return 'negative'
        return 'neutral'

    for folder in folders_of_interest:
        input_folder_path = os.path.join(base_output_dir, folder)
        valence_output_path = os.path.join(valence_dir, folder)
        # open(..., 'a') creates the file but not its missing parent folders.
        os.makedirs(valence_output_path, exist_ok=True)

        for filename in os.listdir(input_folder_path):
            # str.rstrip('.jsonl') strips any trailing run of the characters
            # '.', 'j', 's', 'o', 'n', 'l', which mangles stems ending in one
            # of those letters; drop the exact suffix instead.
            stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename
            print(stem)

            file_path = os.path.join(input_folder_path, filename)
            # Read as UTF-8 explicitly so non-ASCII words do not depend on
            # the platform's default encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                fmnts = [json.loads(line)['fmnt']['syntactic'] for line in file]

            weighted_fmnt = emos.combine_edgelists(fmnts)
            language = 'italian' if 'ITA' in filename else 'english'

            positive, negative, neutral = get_valence_sets(weighted_fmnt, language=language)

            output_file_path = os.path.join(valence_output_path, f"{stem}_valence.jsonl")
            with open(output_file_path, 'a', encoding='utf-8') as out_file:
                for i in range(1000):
                    if i % 50 == 0:
                        print(i)
                    random_positive, random_negative, _random_neutral = swap_elements_in_sets(positive, negative, neutral)

                    # Edge count and summed weight per valence pairing.
                    random_weights = dict.fromkeys(counter_names, 0)

                    for node1, node2, weight in weighted_fmnt:
                        pairing = pairing_of[frozenset((
                            valence_label(node1, random_positive, random_negative),
                            valence_label(node2, random_positive, random_negative),
                        ))]
                        random_weights[pairing] += 1
                        random_weights['weight_' + pairing] += weight

                    json.dump(random_weights, out_file)
                    out_file.write('\n')
                    # Flush after every shuffle so finished samples reach the
                    # file even if a long run is interrupted.
                    out_file.flush()
                    
# Run the export for every folder in folders_of_interest. Output files are
# opened in append mode, so re-running this cell adds lines to existing files.
export_loop_valences()

# Utilities

In [None]:
def valence_metrics(weighted_network, model, topic, language='english'):
    """Print and plot edge weight per valence pairing, observed vs. one shuffle.

    Nodes are labelled positive/negative/neutral with ``get_valence_sets``;
    a second labelling is obtained by shuffling those sets once with
    ``swap_elements_in_sets``. For both labellings the edge weights are
    summed per unordered valence pairing, printed, and drawn as two pie
    charts side by side.

    Args:
        weighted_network: iterable of ``(node1, node2, weight)`` edges.
        model: name shown in the figure title.
        topic: topic shown in the figure title.
        language: passed through to ``get_valence_sets``.

    Returns:
        None. The function only prints and shows a figure.
    """
    positive, negative, neutral = get_valence_sets(weighted_network, language=language)
    random_positive, random_negative, random_neutral = swap_elements_in_sets(positive, negative, neutral)

    def label_of(word, positive_words, negative_words):
        """Label ``word`` as positive, negative or (by default) neutral."""
        if word in positive_words:
            return 'positive'
        if word in negative_words:
            return 'negative'
        return 'neutral'

    # Pairing order shared by the totals, the z-scores and the pie charts.
    pairings = ('pos_pos', 'pos_neutral', 'neg_neutral', 'neg_neg', 'neutral_neutral', 'pos_neg')

    # Unordered pair of valence labels -> pairing name.
    pairing_of = {
        frozenset(['positive']): 'pos_pos',
        frozenset(['positive', 'neutral']): 'pos_neutral',
        frozenset(['negative', 'neutral']): 'neg_neutral',
        frozenset(['negative']): 'neg_neg',
        frozenset(['neutral']): 'neutral_neutral',
        frozenset(['positive', 'negative']): 'pos_neg',
    }

    # Weighted network analysis
    observed_totals = dict.fromkeys(pairings, 0)
    shuffled_totals = dict.fromkeys(pairings, 0)

    for node1, node2, weight in weighted_network:
        observed_pair = frozenset((label_of(node1, positive, negative),
                                   label_of(node2, positive, negative)))
        shuffled_pair = frozenset((label_of(node1, random_positive, random_negative),
                                   label_of(node2, random_positive, random_negative)))
        observed_totals[pairing_of[observed_pair]] += weight
        shuffled_totals[pairing_of[shuffled_pair]] += weight

    # Report text for each pairing, in printing order.
    report_lines = (
        ('pos_pos', "Total weight of edges between positive nodes"),
        ('pos_neutral', "Total weight of edges between positive and neutral nodes"),
        ('neg_neutral', "Total weight of edges between negative and neutral nodes"),
        ('neg_neg', "Total weight of edges between negative nodes"),
        ('neutral_neutral', "Total weight of edges between neutral nodes"),
        ('pos_neg', "Total weight of edges between contrasting nodes"),
    )

    print("\nWeighted Network Metrics:")
    for pairing, text in report_lines:
        print(f"{text}: {observed_totals[pairing]}")

    print("\nRandom Distribution Metrics:")
    for pairing, text in report_lines:
        print(f"{text}: {shuffled_totals[pairing]}")

    # Calculate z-scores
    normal_weights = [observed_totals[pairing] for pairing in pairings]
    random_weights = [shuffled_totals[pairing] for pairing in pairings]

    # NOTE(review): zscore is taken over the 12 pooled values (6 observed +
    # 6 from a single shuffle) and only the first 6 are kept; this is not a
    # z-score against a distribution of shuffles - confirm this is intended.
    z_scores = stats.zscore(normal_weights + random_weights)[:len(normal_weights)]

    labels = ['Positive-Positive', 'Positive-Neutral', 'Negative-Neutral', 'Negative-Negative', 'Neutral-Neutral','Positive-Negative']

    print("\nZ-scores (Normal vs Random):")
    for label, z_score in zip(labels, z_scores):
        print(f"{label}: {z_score:.2f}")

    # Plotting
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))

    panels = (
        (ax1, normal_weights, 'Normal Valence Distribution'),
        (ax2, random_weights, 'Random Valence Distribution'),
    )
    for axis, sizes, title in panels:
        axis.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
        axis.set_title(title)

    plt.suptitle(f'Distribution of Edge Weights by Node Valence for model {model} and topic {topic}: Normal vs Random')
    plt.show()
# Example usage:

#valence_metrics(w_fmnt,'Haiku','climate')


In [None]:

# Report and plot the valence metrics of every Italian file, i.e. every file
# whose name contains "(ITA)".
for folder in folders_of_interest:
    folder_path = os.path.join(base_output_dir, folder)

    for filename in os.listdir(folder_path):
        if '(ITA)' not in filename:
            continue

        file_path = os.path.join(folder_path, filename)
        # Read as UTF-8 explicitly so non-ASCII words do not depend on the
        # platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            fmnts = [json.loads(line)['fmnt']['syntactic'] for line in file]

        weighted_fmnt = emos.combine_edgelists(fmnts)
        print(len(weighted_fmnt))
        if len(weighted_fmnt) == 0:
            # Indexing the first edge of an empty network raises IndexError
            # and there is nothing to plot, so skip the file.
            print(f"No edges found in {filename}; skipping.")
            continue
        print(weighted_fmnt[0])

        # str.rstrip('.jsonl') strips any trailing run of the characters
        # '.', 'j', 's', 'o', 'n', 'l', which mangles names ending in one of
        # those letters; drop the exact suffix instead.
        model_label = filename[:-len('.jsonl')] if filename.endswith('.jsonl') else filename
        valence_metrics(weighted_fmnt, model_label, folder, language='italian')
    