In [None]:
from emoatlas import EmoScores
import os
import json
import re
from pprint import pprint
from collections import defaultdict
from emoatlas.resources import _valences
import random
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
from math import sqrt


# Single shared EmoScores instance; later cells call emos.combine_edgelists on it.
emos = EmoScores()

# Importing dataset

In [None]:
# The input file is located at <base_output_dir>/<folder>/<model>.jsonl
base_output_dir = "localdb"  # first path component
folder = "math"              # sub-directory inside base_output_dir
model = "Haiku"              # file stem; '.jsonl' is appended when the path is built

In [None]:
def extract_model(filename):
    """Return the text captured between underscores in ``filename``.

    The pattern is searched left to right and the capture is non-greedy, so
    the result is the shortest run found between the first matching pair of
    underscores (``"run_Haiku_v2"`` -> ``"Haiku"``).  ``None`` is returned
    when the pattern does not match.
    """
    found = re.search(r'_(.*?)_', filename)
    return found.group(1) if found is not None else None

In [None]:
def get_valence_sets(tfmn, language='english'):
    """Split the nodes of an edge list into positive, negative and neutral sets.

    Parameters
    ----------
    tfmn : iterable of edges
        Only the first two items of every edge are read; they are treated as
        the node labels.
    language : str
        Forwarded to ``_valences`` to select the lexicon.

    Returns
    -------
    tuple of set
        ``(positive_nodes, negative_nodes, neutral_nodes)``.  A node counts as
        neutral when it is in neither the positive nor the negative lexicon;
        the third lexicon returned by ``_valences`` is not consulted.
    """
    positive_lexicon, negative_lexicon, _ = _valences(language)

    # Every distinct endpoint that appears in the network.
    nodes = {endpoint for edge in tfmn for endpoint in edge[:2]}

    positive_nodes = nodes.intersection(positive_lexicon)
    negative_nodes = nodes.intersection(negative_lexicon)
    neutral_nodes = nodes.difference(positive_nodes, negative_nodes)

    return positive_nodes, negative_nodes, neutral_nodes

In [None]:
def swap_elements_in_sets(set1, set2, set3):
    """Randomly exchange elements between three sets, keeping every set's size.

    ``10 * (len(set1) + len(set2) + len(set3))`` swap attempts are made.  Each
    attempt picks two different non-empty sets, draws one element from each
    and exchanges them.  An attempt is skipped when either element is already
    present in the other set, so no set can shrink through de-duplication.

    Parameters
    ----------
    set1, set2, set3 : set
        Input sets.  They are not modified.

    Returns
    -------
    tuple of set
        Three new sets with the same sizes as the inputs.  When fewer than two
        of the inputs are non-empty no swap is possible and plain copies are
        returned.

    Notes
    -----
    Randomness comes from the global ``random`` module state; call
    ``random.seed`` beforehand for reproducible output.
    """
    # The sets answer membership tests in O(1) and are what is returned; the
    # parallel lists allow O(1) random draws and in-place replacement.
    members = [set(set1), set(set2), set(set3)]
    pools = [list(group) for group in members]
    total_elements = sum(len(pool) for pool in pools)

    # An empty set has nothing to trade (and random draws from it would raise),
    # so only non-empty sets are eligible for swapping.
    candidates = [idx for idx, pool in enumerate(pools) if pool]
    if len(candidates) < 2:
        return members[0], members[1], members[2]

    for _ in range(10 * total_elements):
        # Two distinct sets, then one random position inside each of them.
        first, second = random.sample(candidates, 2)
        pos1 = random.randrange(len(pools[first]))
        pos2 = random.randrange(len(pools[second]))
        word1 = pools[first][pos1]
        word2 = pools[second][pos2]

        # Skip swaps that would create a duplicate in the receiving set.
        if word1 in members[second] or word2 in members[first]:
            continue

        pools[first][pos1], pools[second][pos2] = word2, word1
        members[first].remove(word1)
        members[first].add(word2)
        members[second].remove(word2)
        members[second].add(word1)

    return members[0], members[1], members[2]

In [None]:
# List the files available under <base_output_dir>/<folder>, printing each
# name without its '.jsonl' extension.
model_paths = os.path.join(base_output_dir, folder)

for filename in os.listdir(model_paths):
    # str.rstrip('.jsonl') would strip any trailing run of the characters
    # '.', 'j', 's', 'o', 'n', 'l' (e.g. "Gemini-Pro.jsonl" -> "Gemini-Pr"),
    # so the exact suffix is removed instead.
    if filename.endswith('.jsonl'):
        print(filename[:-len('.jsonl')])
    else:
        print(filename)

In [None]:
# Load one model's networks, merge them, and show the real positive set next
# to a shuffled one.
input_folder_path = os.path.join(base_output_dir, folder, model + '.jsonl')

# Each line of the file is one JSON record; only its ['fmnt']['syntactic']
# entry is kept.
with open(input_folder_path, 'r') as file:
    fmnts = [json.loads(line)['fmnt']['syntactic'] for line in file]

weighted_fmnt = emos.combine_edgelists(fmnts)

positive, negative, neutral = get_valence_sets(weighted_fmnt, language='english')
random_positive, random_negative, random_neutral = swap_elements_in_sets(
    positive, negative, neutral
)

# Size and content of the original positive set, then of the shuffled one.
print(len(positive), positive)
print(len(random_positive), random_positive)



In [None]:
# Path of the JSONL file that get_random_edges() opens on every call.
input_folder_path = os.path.join(base_output_dir, folder, model + '.jsonl')

def get_random_edges():
    """Tally the edges of the combined network by shuffled valence pair.

    Reads the JSONL file at the module-level ``input_folder_path``, takes the
    ``['fmnt']['syntactic']`` edge list of every record and merges them with
    ``emos.combine_edgelists``.  Node valence sets are obtained from
    ``get_valence_sets`` and then shuffled by ``swap_elements_in_sets``; every
    edge is classified by the shuffled labels of its two endpoints.

    Returns
    -------
    dict
        For each of the six unordered valence pairs (``pos_pos``,
        ``pos_neutral``, ``neg_neutral``, ``neg_neg``, ``neutral_neutral``,
        ``pos_neg``) the number of edges, and under ``weight_<pair>`` the sum
        of their weights.  Every value is 0 when the network has no edges.
    """
    with open(input_folder_path, 'r') as file:
        fmnts = [json.loads(line)['fmnt']['syntactic'] for line in file]

    weighted_fmnt = emos.combine_edgelists(fmnts)

    positive, negative, neutral = get_valence_sets(weighted_fmnt, language='english')
    random_positive, random_negative, _ = swap_elements_in_sets(positive, negative, neutral)

    def get_random_valence(word):
        """Label ``word`` according to the shuffled valence sets."""
        if word in random_positive:
            return 'positive'
        if word in random_negative:
            return 'negative'
        return 'neutral'

    # Weighted network analysis: edge counts and summed weights per pair.
    random_weights = {
        'pos_pos': 0, 'pos_neutral': 0, 'neg_neutral': 0,
        'neg_neg': 0, 'neutral_neutral': 0, 'pos_neg': 0,
        'weight_pos_pos': 0, 'weight_pos_neutral': 0, 'weight_neg_neutral': 0,
        'weight_neg_neg': 0, 'weight_neutral_neutral': 0, 'weight_pos_neg': 0
    }

    # Edge direction is irrelevant, so the two labels are looked up as an
    # unordered pair (a frozenset of one label means both endpoints agree).
    pair_to_key = {
        frozenset(('positive',)): 'pos_pos',
        frozenset(('positive', 'neutral')): 'pos_neutral',
        frozenset(('negative', 'neutral')): 'neg_neutral',
        frozenset(('negative',)): 'neg_neg',
        frozenset(('neutral',)): 'neutral_neutral',
        frozenset(('positive', 'negative')): 'pos_neg',
    }

    # assumes every combined edge unpacks as (node1, node2, weight) -- TODO
    # confirm against emos.combine_edgelists
    for node1, node2, weight in weighted_fmnt:
        labels = frozenset((get_random_valence(node1), get_random_valence(node2)))
        key = pair_to_key[labels]
        random_weights[key] += 1
        random_weights['weight_' + key] += weight

    # Return only after every edge has been tallied; returning from inside the
    # loop would stop after the first edge.
    return random_weights
    
# Print the tallies produced by one randomised run.
print(get_random_edges())


In [None]:
# Repeat the randomised tally 100 times and keep the 'pos_pos' entry of each run.
# NOTE(review): despite the variable name this stores the 'pos_pos' entry,
# not 'weight_pos_pos' -- confirm which one is intended.
pos_pos_weights = []

for run in range(100):
    pos_pos_weights.append(get_random_edges()['pos_pos'])
    if not run % 50:
        print(run)  # progress marker every 50 runs

print(pos_pos_weights)

In [None]:

def calculate_stats(data):
    """Print the mean, sample standard deviation, size and 95% CI of ``data``.

    The standard deviation uses ``ddof=1``.  The confidence interval is
    ``mean +/- 1.96 * std / sqrt(n)``.  Nothing is returned; all results are
    written to standard output.
    """
    sample_mean = np.mean(data)
    sample_std = np.std(data, ddof=1)
    sample_size = len(data)
    print(sample_mean, sample_std, sample_size)

    # 1.96 is the z-value used here for a 95% confidence level.
    z_95 = 1.96
    half_width = z_95 * (sample_std / sqrt(sample_size))
    lower, upper = sample_mean - half_width, sample_mean + half_width

    report = (
        f"Mean: {sample_mean}",
        f"Standard Deviation: {sample_std}",
        f"Sample Size: {sample_size}",
        f"95% Confidence Interval: {(lower, upper)}",
    )
    for report_line in report:
        print(report_line)




# Summarise the 'pos_pos' values collected over the repeated randomised runs.
calculate_stats(pos_pos_weights)