In [None]:
from emoatlas import EmoScores
import os
import json
import re
from pprint import pprint
import pymongo
import networkx as nx
from collections import defaultdict
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from emoatlas.resources import _valences


In [None]:
# Configuration for the whole notebook.

# Define the base directory containing the folders of interest
base_dir = 'localdb'

# Folders of interest (sub-directories of base_dir whose files are loaded)
folders_of_interest = ['math']
# Key into models_data selecting which model's network the later cells analyse
model1='Haiku'


# EmoScores instance used for the non-Italian files
# (presumably the library's default language is English; confirm against emoatlas)
emos = EmoScores()
# EmoScores instance used for the files tagged '(ITA)'
emosita = EmoScores('italian')

# 'eng' loads the files without '(ITA)' in their path, 'ita' loads only those with it
language='eng'

In [None]:
# Build one combined network and one text corpus per model file.
# models_data maps a model name (the file name without its '.jsonl' extension) to
#   {'Network': combined edge list of the syntactic networks found in the file,
#    'Texts':   every record's lemmatized tokens joined into one space-separated string}
models_data={}

for folder in folders_of_interest:
    folder_path = os.path.join(base_dir, folder)

    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)

        # Files tagged '(ITA)' belong to the Italian run; everything else to the English run.
        is_italian_file = '(ITA)' in file_path
        if language == 'ita' and is_italian_file:
            scorer = emosita
        elif language == 'eng' and not is_italian_file:
            scorer = emos
        else:
            # File does not belong to the selected language.
            continue

        fmnts = []
        texts = []
        # JSON Lines files are UTF-8; do not depend on the platform default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                json_obj = json.loads(line)
                fmnts.append(json_obj['fmnt']['syntactic'])
                texts.append(' '.join(json_obj['lemmatized_test']))

        # str.rstrip('.jsonl') removes any trailing run of the characters
        # '.', 'j', 's', 'o', 'n', 'l' (so 'Mistral.jsonl' became 'Mistra');
        # remove the extension as a suffix instead.
        suffix = '.jsonl'
        model_name = filename[:-len(suffix)] if filename.endswith(suffix) else filename

        models_data[model_name] = {
            'Network': scorer.combine_edgelists(fmnts),
            'Texts': ' '.join(texts),
        }


In [None]:
# Show which model names were loaded (these are the valid values for model1)
print(models_data.keys())



# Edge list of the selected model without weights (not used further in this cell)
retenonpesata = [(i[0],i[1]) for i in models_data[model1]['Network']]
# Keep only the edges whose third element (the weight) is greater than 49.
# NOTE(review): the later cells filter with >50 instead; confirm the difference is intended.
retepesata = [(i[0],i[1]) for i in models_data[model1]['Network'] if i[2]>49]
print(retepesata)


# Create an undirected graph using networkx (nx.Graph is the undirected graph class)
G = nx.Graph()
G.add_edges_from(retepesata)
# Report the degree of "anxiety" in the thresholded graph, if the word survived the filter
if "anxiety" in G:
    degree = G.degree("anxiety")
    print(f'The degree of the word "anxiety" is: {degree}')
else:
    print('"anxiety" is not in the graph.')

# Convert the networkx graph into emoatlas' formamentis representation
Gfm=emos.nxgraph_to_formamentis(G)

# Extract and draw the part of the network around "anxiety"
fmnt_you = emos.extract_word_from_formamentis(Gfm, 'anxiety')
emos.draw_formamentis(fmnt_you,alpha_syntactic=0.4,thickness=0.7)
# Results recorded from earlier runs, target word: therapist
# Haiku: "therapist" is not present
#The max diameter of the network is: 4
#The shortest path length to these nodes is: 3

# GPT - 3.5:
#The max diameter of the network is: 5
#The furthest nodes from 'math' are: ['post', 'secondary', 'size', 'therapist']
#The shortest path length to these nodes is: 3

#Llama-3-8B
#The max diameter of the network is: 8
#The furthest nodes from 'math' are: ['demetrovics']
#The shortest path length to these nodes is: 5

# (model label missing in the original notes)
#The max diameter of the network is: 9
#The furthest nodes from 'math' are: ['finkelstein']
#The shortest path length to these nodes is: 5


In [None]:
# Collect every word that appears as an endpoint of an edge in the selected model's network,
# then check whether "therapist" is among them.
settone = {
    word
    for edge in models_data[model1]['Network']
    for word in (edge[0], edge[1])
}
print(settone)
print('therapist' in settone)

In [None]:

# Unweighted edge list of the selected model, and the subset of edges with weight above 50
retenonpesata = [
    (edge[0], edge[1])
    for edge in models_data[model1]['Network']
]
retepesata = [
    (edge[0], edge[1])
    for edge in models_data[model1]['Network']
    if edge[2] > 50
]
print(retepesata)


# Build an undirected networkx graph from the thresholded edges
G = nx.Graph()
G.add_edges_from(retepesata)

# The graph may be disconnected, so the diameter is computed per connected component
components = list(nx.connected_components(G))
diameters = [nx.diameter(G.subgraph(nodes)) for nodes in components]

# The reported diameter is the largest one found among the components
max_diameter = max(diameters)

def get_furthest_nodes(G, source_node):
    """Find the nodes with the largest shortest-path distance from ``source_node``.

    Only nodes reachable from ``source_node`` are considered, because the
    distances come from ``nx.single_source_shortest_path_length``.

    As a side effect, prints the nodes that are one step closer than the
    furthest ones (distance ``max_length - 1``).

    Parameters
    ----------
    G : networkx graph
        Graph to search.
    source_node : node
        Node from which distances are measured.

    Returns
    -------
    tuple
        ``(furthest_nodes, max_length)``: the list of nodes at the maximum
        distance and that distance.
    """
    # Calculate shortest path lengths from the source node to all reachable nodes
    path_lengths = nx.single_source_shortest_path_length(G, source_node)

    # Find the maximum path length
    max_length = max(path_lengths.values())

    # Nodes at the maximum distance, and nodes one step closer than that
    furthest_nodes = [node for node, length in path_lengths.items() if length == max_length]
    almost_furthest_nodes = [node for node, length in path_lengths.items() if length == max_length-1]
    # These are NOT the furthest nodes, so label them accurately; the previous
    # message called them "the furthest nodes", contradicting the returned list.
    print(
        f"The second-furthest nodes from '{source_node}' "
        f"(distance {max_length - 1}) are: {almost_furthest_nodes}"
    )

    return furthest_nodes, max_length

# Example usage:
# Uses the thresholded graph G built earlier in this cell

# Find the furthest nodes from "math"
source_node = "math"
furthest_nodes, max_length = get_furthest_nodes(G, source_node)

# Report the largest component diameter and the nodes furthest from the source
print(f"The max diameter of the network is: {max_diameter}")
print(f"The furthest nodes from '{source_node}' are: {furthest_nodes}")
print(f"The shortest path length to these nodes is: {max_length}")


In [None]:
# Weighted edges (source, target, weight) of the selected model, keeping only weights above 50
retepesatapermindstream = [
    (edge[0], edge[1], edge[2])
    for edge in models_data[model1]['Network']
    if edge[2] > 50
]

# Words that still appear in the network after thresholding, and whether "therapist" survived
settone = {
    word
    for edge in retepesatapermindstream
    for word in (edge[0], edge[1])
}
print(settone)
print('therapist' in settone)

In [None]:
# Print every thresholded edge that has "therapist" as one of its endpoints
for edge in retepesatapermindstream:
    if 'therapist' in (edge[0], edge[1]):
        print(edge)

In [None]:
# Mindset stream between "math" and "therapist" on the edges with weight above 50
emos.plot_mindset_stream(retepesatapermindstream,'math','therapist')

In [None]:
# Full edge list of the selected model with the weights dropped (no thresholding)
retenonpesata = [(i[0],i[1]) for i in models_data[model1]['Network']]


# Create an undirected graph using networkx (nx.Graph is the undirected graph class)
G = nx.Graph()
G.add_edges_from(retenonpesata)

In [None]:
# Convert the unweighted graph to a formamentis network, then for each word of interest
# extract its part of the network and draw the statistically significant emotions.
Gfm=emos.nxgraph_to_formamentis(G)
# Word of interest: "medium"
fmnt_you = emos.extract_word_from_formamentis(Gfm, 'medium')
emos.draw_statistically_significant_emotions(fmnt_you,title=f'Emotion detection - Medium, {model1}')
# Word of interest: "social" (fmnt_you is overwritten)
fmnt_you = emos.extract_word_from_formamentis(Gfm, 'social')
emos.draw_statistically_significant_emotions(fmnt_you,title=f'Emotion detection - Social, {model1}')

In [None]:
%\begin{figure}[!htbp]
%    \centering
%    \begin{subfigure}[t]{0.49\textwidth}
%        \centering
%        \captionsetup{justification=centering}
%        \includegraphics[width=\linewidth, trim=0.25cm 0.25cm 0.25cm 1.25cm, clip]{figures/mistraltherapist.png}
%        \caption{Math Anxiety - Mindset Stream of \textsc{\char13}Math-Therapist\textsc{\char13} in the combined TFMN of Mistral. \textbf{Only the shortest paths in the top 5\% of path weights (top quantile 0.05) are shown.}}
%    \end{subfigure}
%    \begin{subfigure}[t]{0.49\textwidth}
%        \centering
%        \captionsetup{justification=centering}
%        \includegraphics[width=\linewidth, trim=0.25cm 0.25cm 0.25cm 1.25cm, clip]{figures/llamatherapist.png}
%        \caption{Math Anxiety - Mindset Stream of \textsc{\char13}Math-Therapist\textsc{\char13} in the combined TFMN of LLama. All shortest paths are shown.}
%    \end{subfigure}
%    
%    \begin{subfigure}[t]{0.49\textwidth}
%        \centering
%        \captionsetup{justification=centering}
%        \includegraphics[width=\linewidth, trim=0.25cm 0.25cm 0.25cm 1.25cm, clip]{figures/mindsettherapist.png}
%        \caption{Mindset stream of \textsc{\char13}math-therapist\textsc{\char13} in the combined TFMN of GPT-3.5. All shortest paths are shown.}
%    \end{subfigure}
%    \caption{{Math Anxiety - Mindset Stream of \textsc{\char13}Math-Therapist\textsc{\char13} in the combined TFMN of various LLMs.}}
%    \label{fig:mindsetmath}
%\end{figure}
%

In [None]:
# Weighted edges (source, target, weight) of the selected model with weight of at least 1
retepesatafiltrata = [(i[0],i[1],i[2]) for i in models_data[model1]['Network'] if i[2]>=1]
# Display the filtered edge list as the cell output
retepesatafiltrata


In [None]:
# Mindset stream between "math" and "therapist" on the weighted edges, with top_quantile=0.05
emos.plot_mindset_stream(retepesatafiltrata,'math','therapist',top_quantile=0.05)

In [None]:
# Mindset stream between "math" and "therapist" on the unweighted edge list (2-tuples, no weights)
emos.plot_mindset_stream(retenonpesata,'math','therapist')

In [None]:
def calculate_path_weight(network, path):
    """Sum the weights of the edges traversed by ``path``.

    ``network`` is an iterable of edges indexable as ``(node_a, node_b, weight)``
    and ``path`` is a sequence of nodes. Edges are matched in either direction.
    For each consecutive pair of nodes, the first matching edge in ``network``
    contributes its weight; a pair with no matching edge contributes nothing.
    """
    total = 0
    for src, dst in zip(path, path[1:]):
        forward, backward = (src, dst), (dst, src)
        # First edge joining the two nodes (in either orientation), or 0 if there is none
        total += next(
            (edge[2] for edge in network if (edge[0], edge[1]) in (forward, backward)),
            0,
        )
    return total

In [None]:
def analyze_paths(network, start_node, end_node):
    """Tabulate every shortest path between two nodes together with its total weight.

    The paths come from ``emos.find_all_shortest_paths`` and each one is scored
    with ``calculate_path_weight``. Returns a DataFrame with the columns
    'Path' (the path converted to its string form) and 'Total Weight'.
    """
    shortest = emos.find_all_shortest_paths(network, start_node, end_node)

    # One (path, weight) record per shortest path
    records = [(route, calculate_path_weight(network, route)) for route in shortest]

    table = pd.DataFrame(records, columns=['Path', 'Total Weight'])
    # Store paths as strings so they can be used as categorical axis labels
    table['Path'] = table['Path'].astype(str)
    return table

In [None]:
def plot_path_weights(df):
    """Draw a horizontal bar chart of path weights, heaviest path first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Path' and 'Total Weight'.

    A dashed vertical line marks the weight of the path at the 25% position of
    the descending ranking, annotated as 'Top Quartile'. If ``df`` has no rows,
    a message is printed and nothing is drawn. Returns None.
    """
    # With no rows there is nothing to draw, and the .iloc lookup below would
    # raise IndexError.
    if len(df) == 0:
        print("No paths to plot.")
        return

    # Sort the dataframe by 'Total Weight' in descending order
    df_sorted = df.sort_values('Total Weight', ascending=False)

    sns.set_style("whitegrid")
    # Create the horizontal bar plot
    plt.figure(figsize=(10, 12))  # Adjusted figure size for better visibility
    ax = sns.barplot(y='Path', x='Total Weight', data=df_sorted, palette='viridis_r', orient='h')

    # Customize the plot
    plt.title('Total Shortest Path Weights', fontsize=16)
    plt.xlabel('Total Path Weight', fontsize=12)
    plt.ylabel('Path', fontsize=12)

    # Make y-axis labels smaller
    plt.yticks(fontsize=8)

    # Add value labels at the end of each bar
    for i, v in enumerate(df_sorted['Total Weight']):
        ax.text(v, i, f' {v}', va='center', fontsize=8)

    # Add a line separating the top quartile (rows are in descending order,
    # so the boundary sits 25% of the way down the ranking)
    n = len(df_sorted)
    quartile_index = int(0.25 * n)
    quartile_value = df_sorted['Total Weight'].iloc[quartile_index]
    plt.axvline(x=quartile_value, color='black', linestyle='--', linewidth=1)

    # Add text annotation for the quartile line at y = -1.5
    # (previously written as n-n-1.5, which is the same value)
    plt.text(quartile_value + (quartile_value * 0.2), -1.5, 'Top Quartile', ha='center', va='top', color='black', fontsize=14, rotation=360)

    # Adjust layout and show the plot
    plt.tight_layout()
    plt.show()

In [None]:
# Tabulate and plot the weights of all shortest paths between "human" and "future".
# NOTE(review): weighted_network is not assigned in any visible cell of this notebook, so this
# cell raises NameError on a fresh kernel (Restart & Run All) — define it in an earlier cell
# or confirm where it is meant to come from.
df = analyze_paths(weighted_network,'human','future')

plot_path_weights(df)

In [None]:
# Mindset stream between "human" and "future" on weighted_network
# NOTE(review): weighted_network is not assigned in any visible cell — confirm its origin
emos.plot_mindset_stream(weighted_network,'human','future')


In [None]:
def get_top_quantile_paths(network, start_node, end_node, top_quantile=0.25):
    """Return the heaviest shortest paths between two nodes.

    All shortest paths from ``emos.find_all_shortest_paths`` are scored with
    ``calculate_path_weight`` and ranked by descending weight. The first
    ``int(number_of_paths * top_quantile)`` paths are returned, with a minimum
    count of one. Only the paths are returned, not their weights.
    """
    candidates = emos.find_all_shortest_paths(network, start_node, end_node)

    # Pair each path with its total weight, then rank heaviest first
    scored = [(route, calculate_path_weight(network, route)) for route in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Always keep at least one path, even when the quantile rounds down to zero
    keep = max(1, int(len(scored) * top_quantile))

    return [pair[0] for pair in scored[:keep]]

In [None]:
# Heaviest shortest paths between "human" and "future" (default top_quantile of 0.25)
top_paths = get_top_quantile_paths(weighted_network,'human','future')

In [None]:
# Display the selected paths as the cell output
top_paths

In [None]:
def extract_unique_words(lists_of_lists):
    """Join the distinct words found in a list of word lists and draw them.

    Parameters
    ----------
    lists_of_lists : iterable of iterables of str
        For example, a list of paths, each path being a list of words.

    Returns
    -------
    str
        The distinct words separated by single spaces, in order of first
        appearance. An empty input gives an empty string.

    Side effect: the resulting string is passed to
    ``emos.draw_formamentis_flower``.
    """
    # dict.fromkeys removes duplicates while keeping first-appearance order.
    # A set was used before, but the iteration order of a set of strings
    # changes between interpreter sessions, so the result was not reproducible.
    unique_words = dict.fromkeys(
        word for sublist in lists_of_lists for word in sublist
    )

    # Join the unique words into a single string separated by spaces
    result_string = ' '.join(unique_words)
    emos.draw_formamentis_flower(result_string)
    return result_string

# Draw and return the distinct words lying on any shortest path between "human" and "future"
# NOTE(review): weighted_network is not assigned in any visible cell — confirm its origin
extract_unique_words(emos.find_all_shortest_paths(weighted_network,'human','future'))

In [None]:
# Mindset stream restricted to the heaviest shortest paths.
# top_paths was computed for 'human' -> 'future', so the start node must be 'human' as well
# (it was 'community', which does not match the supplied paths).
emos.plot_mindset_stream(weighted_network,'human','future',shortest_paths=top_paths)


In [None]:
# Mindset stream between "human" and "future" using all shortest paths
# (same call as in an earlier cell of this notebook)
emos.plot_mindset_stream(weighted_network,'human','future')


In [None]:
def path_metrics(weighted_network, start_node, end_node, shortest_paths=None, top_quantile=None):
    """Print valence statistics for shortest paths and for the whole network.

    Parameters
    ----------
    weighted_network : iterable of (node, node, weight)
        Weighted edge list.
    start_node, end_node : str
        Endpoints used to look up shortest paths when none are supplied.
    shortest_paths : list of list of str, optional
        Paths to analyse. When given, it is used as is and ``top_quantile``
        is ignored.
    top_quantile : float, optional
        When given (and ``shortest_paths`` is not), the paths come from
        ``emos.get_top_quantile_shortest_paths``; otherwise from
        ``emos.find_all_shortest_paths``.

    Raises
    ------
    ValueError
        If the top-quantile path lookup fails.

    Words are classified with the English valence lists from ``_valences``.
    Any word that is in neither the positive nor the negative list counts as
    neutral, including the words of the third ("ambivalent") list.

    Prints path-level counts and network-level weight totals, then shows a
    pie chart of the weight totals. Edges joining a positive and a negative
    word belong to none of the five reported categories. Returns None.
    """
    # Pick the paths to analyse. Previously the top-quantile branch could never
    # run (the paths had already been filled in) and it referenced an
    # undefined name `graph`.
    if shortest_paths is None:
        if top_quantile is not None:
            try:
                shortest_paths = emos.get_top_quantile_shortest_paths(
                    weighted_network, start_node, end_node, top_quantile=top_quantile
                )
            except Exception as exc:
                # Chain the original error so the real cause stays visible.
                raise ValueError(
                    "If a quantile is set, weights should be necessary in the graph."
                ) from exc
        else:
            shortest_paths = emos.find_all_shortest_paths(weighted_network, start_node, end_node)

    positive, negative, _ambivalent = _valences('english')

    def get_valence(word):
        """Classify a word as 'positive', 'negative' or (everything else) 'neutral'."""
        if word in positive:
            return 'positive'
        elif word in negative:
            return 'negative'
        else:
            return 'neutral'

    # Unweighted network analysis (over the selected shortest paths)
    positive_edges = 0
    negative_edges = 0
    all_positive_paths = 0
    all_negative_paths = 0
    all_neutral_paths = 0

    for path in shortest_paths:
        valences = [get_valence(word) for word in path]

        # Count consecutive pairs of words that share a non-neutral valence
        for i in range(len(path) - 1):
            if valences[i] == 'positive' and valences[i+1] == 'positive':
                positive_edges += 1
            elif valences[i] == 'negative' and valences[i+1] == 'negative':
                negative_edges += 1

        # Count paths whose words all share one valence
        if all(v == 'positive' for v in valences):
            all_positive_paths += 1
        elif all(v == 'negative' for v in valences):
            all_negative_paths += 1
        elif all(v == 'neutral' for v in valences):
            all_neutral_paths += 1

    print("Unweighted Network Metrics:")
    print(f"Edges connecting 2 positive nodes: {positive_edges}")
    print(f"Edges connecting 2 negative nodes: {negative_edges}")
    print(f"Paths where all words are positive: {all_positive_paths}")
    print(f"Paths where all words are negative: {all_negative_paths}")
    print(f"Paths where all words are neutral: {all_neutral_paths}")

    # Weighted network analysis (over every edge of the network, not just the paths)
    weight_pos_pos = 0
    weight_pos_neutral = 0
    weight_neg_neutral = 0
    weight_neg_neg = 0
    weight_neutral_neutral = 0

    for node1, node2, weight in weighted_network:
        valence1 = get_valence(node1)
        valence2 = get_valence(node2)

        if valence1 == 'positive' and valence2 == 'positive':
            weight_pos_pos += weight
        elif (valence1 == 'positive' and valence2 == 'neutral') or (valence1 == 'neutral' and valence2 == 'positive'):
            weight_pos_neutral += weight
        elif (valence1 == 'negative' and valence2 == 'neutral') or (valence1 == 'neutral' and valence2 == 'negative'):
            weight_neg_neutral += weight
        elif valence1 == 'negative' and valence2 == 'negative':
            weight_neg_neg += weight
        elif valence1 == 'neutral' and valence2 == 'neutral':
            weight_neutral_neutral += weight

    print("\nWeighted Network Metrics:")
    print(f"Total weight of edges between positive nodes: {weight_pos_pos}")
    print(f"Total weight of edges between positive and neutral nodes: {weight_pos_neutral}")
    # This line used to be labelled "positive and neutral" as well.
    print(f"Total weight of edges between negative and neutral nodes: {weight_neg_neutral}")
    print(f"Total weight of edges between negative nodes: {weight_neg_neg}")
    print(f"Total weight of edges between neutral nodes: {weight_neutral_neutral}")

    # Plotting
    labels = ['Positive-Positive', 'Positive-Neutral','Negative-Neutral', 'Negative-Negative', 'Neutral-Neutral']
    sizes = [weight_pos_pos, weight_pos_neutral,weight_neg_neutral, weight_neg_neg, weight_neutral_neutral]
    if not any(sizes):
        # Every category is zero, so there are no slices to draw.
        print("No edge weight falls in any valence category; skipping the pie chart.")
        return
    plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
    plt.axis('equal')
    plt.title('Distribution of Edge Weights by Node Valence')
    plt.show()

# Example usage: valence metrics for the shortest paths between "human" and "future"
# NOTE(review): weighted_network is not assigned in any visible cell — confirm its origin

path_metrics(weighted_network,'human','future')
