In [None]:
import json
import math
import os
import re
from collections import defaultdict
from pprint import pprint

import networkx as nx
import numpy as np
import pandas as pd
from emoatlas import EmoScores
from scipy.spatial.distance import cosine

In [None]:
# --- Notebook configuration ---------------------------------------------

# Root directory; each entry of `folders_of_interest` is a sub-folder of it.
base_dir = "localdb"
valence_dir = "Valence_Metrics"

# 'eng' or 'ita': decides which files are loaded and which scorer combines them.
language = "eng"

# One scorer with the library defaults and one configured for Italian.
emos = EmoScores()
emosita = EmoScores(language="italian")

# Folders of interest. Several later cells read only the FIRST entry
# (folders_of_interest[0]) to pick the stop-word list and the table caption.
folders_of_interest = [
    "climate",
    "math",
    "misinformation_health",
    "gwarming",
]


In [None]:
# Build `models_data`: one entry per model file, holding the combined network
# ('Network') and the concatenated lemmatised text ('Texts') of that file.
#
# File selection depends on `language`:
#   'ita'         -> only paths containing '(ITA)', combined with `emosita`
#   'eng'         -> only paths NOT containing '(ITA)', combined with `emos`
#   anything else -> nothing is loaded
models_data = {}

JSONL_SUFFIX = '.jsonl'

if language == 'ita':
    scorer, want_ita_files = emosita, True
elif language == 'eng':
    scorer, want_ita_files = emos, False
else:
    scorer, want_ita_files = None, None

# Remembers which folder each entry came from, so a silent overwrite across
# folders (same model file name in two topics) becomes visible.
loaded_from = {}

for folder in folders_of_interest:
    folder_path = os.path.join(base_dir, folder)

    # sorted(): os.listdir() returns names in arbitrary order, which would make
    # the model order (and therefore the column order downstream) unstable.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        if scorer is None or ('(ITA)' in file_path) != want_ita_files:
            continue
        if not os.path.isfile(file_path):
            # e.g. a checkpoint sub-directory; open() would fail on it
            continue

        fmnts = []
        texts = []
        # Explicit UTF-8: the platform default encoding is locale dependent.
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # tolerate blank / trailing empty lines in the JSONL file
                    continue
                json_obj = json.loads(line)
                fmnts.append(json_obj['fmnt']['syntactic'])
                texts.append(' '.join(json_obj['lemmatized_test']))

        # str.rstrip('.jsonl') strips any trailing run of the characters
        # . j s o n l, so it can eat the end of the model name itself.
        # Remove the literal suffix instead.
        if filename.endswith(JSONL_SUFFIX):
            model_name = filename[:-len(JSONL_SUFFIX)]
        else:
            model_name = filename

        if model_name in loaded_from:
            print(
                f"Warning: model '{model_name}' from folder '{folder}' replaces "
                f"the entry already loaded from '{loaded_from[model_name]}'"
            )
        loaded_from[model_name] = folder

        models_data[model_name] = {
            'Network': scorer.combine_edgelists(fmnts),
            'Texts': ' '.join(texts),
        }


In [None]:
def create_graph(network_data):
    """Build an undirected weighted graph from an edge list.

    Args:
        network_data: iterable of edges; positions 0 and 1 of each edge are
            the two endpoints and position 2 is stored as the edge's
            'weight' attribute.

    Returns:
        networkx.Graph containing every edge of `network_data`.
    """
    graph = nx.Graph()
    graph.add_weighted_edges_from(
        (edge[0], edge[1], edge[2]) for edge in network_data
    )
    return graph

def compute_metrics(G, text):
    """Compute per-node graph metrics and per-word counts.

    Args:
        G: networkx graph whose edges carry a 'weight' attribute.
        text: string that is split on whitespace to count words.

    Returns:
        Tuple (strength, degree, closeness, word_frequencies):
          * strength   - dict node -> degree weighted by 'weight'
          * degree     - dict node -> unweighted degree
          * closeness  - result of nx.closeness_centrality(G), called without
                         a distance argument
          * word_frequencies - dict token -> number of occurrences in `text`
    """
    strength = dict(G.degree(weight='weight'))
    degree = dict(G.degree())
    closeness = nx.closeness_centrality(G)

    # Term frequencies over whitespace-separated tokens, in first-seen order.
    word_frequencies = {}
    for token in text.split():
        word_frequencies[token] = word_frequencies.get(token, 0) + 1

    return strength, degree, closeness, word_frequencies



In [None]:
def analyze_models(models=None):
    """Compute graph and frequency metrics per model, restricted to shared nodes.

    For each model a graph is built from its 'Network' edge list and
    strength, degree, closeness and term frequency ('tf') are computed.
    Every metric dictionary is then filtered down to the words that are
    graph nodes in ALL models.

    Args:
        models: mapping model name -> {'Network': edge list, 'Texts': str}.
            Defaults to the notebook-level `models_data`.

    Returns:
        dict model name -> {'strength', 'degree', 'closeness', 'tf'}, each a
        dict word -> value containing only the shared nodes. An empty
        mapping yields an empty dict.
    """
    if models is None:
        models = models_data

    metric_names = ('strength', 'degree', 'closeness', 'tf')
    results = {}
    nodes_by_model = {}

    for model, payload in models.items():
        print('doing ', model)
        G = create_graph(payload['Network'])
        strength, degree, closeness, tf = compute_metrics(G, payload['Texts'])

        results[model] = {
            'strength': strength,
            'degree': degree,
            'closeness': closeness,
            'tf': tf,
        }
        # The degree dict has one key per graph node.
        nodes_by_model[model] = set(degree)

    if not nodes_by_model:
        # set.intersection() with no operands raises TypeError.
        return results

    common_words = set.intersection(*nodes_by_model.values())

    # Filter out words not used in all models
    for model_metrics in results.values():
        for name in metric_names:
            model_metrics[name] = {
                word: value
                for word, value in model_metrics[name].items()
                if word in common_words
            }

    # Diagnostic only: nodes that appear in some models but not in all of them.
    words_not_in_all_models = set().union(*nodes_by_model.values()) - common_words
    if words_not_in_all_models:
        print("Words not used by every model:", words_not_in_all_models)

    return results

# Run the analysis on the models loaded above (analyze_models reads `models_data`).
results = analyze_models()



In [None]:
def clean_dictionary_keys(data):
    """Keep only the words present in every metric dictionary of every model.

    Args:
        data: mapping model -> {metric name -> {word -> value}}.

    Returns:
        A new mapping with the same models and metric names, where each
        inner dictionary contains only the words shared by ALL inner
        dictionaries of ALL models. `data` itself is not modified.

    Prints the number of shared words and, per model, the size of each
    metric dictionary after cleaning.
    """
    # One key set per (model, metric) pair.
    all_word_sets = [
        set(inner_dict)
        for model_data in data.values()
        for inner_dict in model_data.values()
    ]
    # set.intersection() needs at least one operand; with no dictionaries at
    # all there is simply nothing in common.
    common_words = set.intersection(*all_word_sets) if all_word_sets else set()

    print(f"Number of common words across all models: {len(common_words)}")

    cleaned_data = {}
    for model, model_data in data.items():
        print(f"Model: {model}")

        cleaned_data[model] = {
            inner_dict_type: {
                word: value
                for word, value in inner_dict.items()
                if word in common_words
            }
            for inner_dict_type, inner_dict in model_data.items()
        }

        # Report the length of each metric after cleaning
        for metric_name, metric_values in cleaned_data[model].items():
            print(f"  Metric: {metric_name}, Len: {len(metric_values)}")

        print("---")

    return cleaned_data

# Rebind `results` to the cleaned copy (only words shared by every metric of
# every model remain).
results=clean_dictionary_keys(results)
# NOTE(review): as the last expression of the cell this echoes the whole nested
# dictionary; consider showing a summary instead.
results

In [None]:
# Topic folder -> words to drop from that topic's metric dictionaries.
# Each list mixes English and Italian terms.
wordstoremove = {
    'math': [
        'math', 'anxiety',
        'matematica', 'ansia',
    ],
    'climate': [
        'climate', 'change',
        'cambiamento', 'climatico',
    ],
    'gwarming': [
        'global', 'warming',
        'riscaldamento', 'globale',
    ],
    'misinformation_health': [
        'misinformation', 'health',
        'disinformazione', 'salute',
        'bufala', 'teoria', 'complotto',
        'information', 'theory', 'conspiracy',
    ],
}

def remove_keys(d, words=None):
    """Return a copy of a nested dictionary without the given keys.

    The filter is applied at every nesting level and compares keys
    case-insensitively. Values that are not dictionaries are kept as they are.

    Args:
        d: the (possibly nested) dictionary to filter. Anything that is not
            a dict is returned unchanged.
        words: iterable of words to drop. Defaults to the stop-word list of
            the first folder in `folders_of_interest`.

    Returns:
        A new dictionary tree; `d` is not modified.
    """
    if not isinstance(d, dict):
        return d

    if words is None:
        words = wordstoremove[folders_of_interest[0]]
    # Build the lookup once instead of re-reading the list for every key.
    blocked = frozenset(word.lower() for word in words)

    def _prune(node):
        # Recursively rebuild dictionaries without the blocked keys.
        if not isinstance(node, dict):
            return node
        return {
            key: _prune(value)
            for key, value in node.items()
            # non-string keys cannot match a word, so they are always kept
            if not (isinstance(key, str) and key.lower() in blocked)
        }

    return _prune(d)

# Filtered copy of `results` without the topic's own words (remove_keys reads
# the stop-word list of folders_of_interest[0]).
results2=remove_keys(results)

In [None]:
def compute_euclidean_differences(results):
    """Add, per model and word, the Euclidean distance to all other models.

    For each metric in ('strength', 'degree', 'closeness', 'tf') and each
    word of a model, the distance is
    sqrt(sum over the other models of (own value - other value) ** 2),
    where a word missing from another model counts as 0.

    Args:
        results: mapping model -> {'strength', 'degree', 'closeness', 'tf'},
            each a dict word -> number.

    Returns:
        The same `results` object, modified in place: every model gains
        'diff_strength', 'diff_degree', 'diff_closeness' and 'diff_tf'
        dictionaries (empty when the corresponding metric is empty).

    Prints, per model, the distances of the word 'believe' as a quick check.
    """
    # metric key -> label used in the check output below
    metric_labels = {
        'strength': 'Strength',
        'degree': 'Degree',
        'closeness': 'Closeness',
        'tf': 'TF',
    }

    for model, model_metrics in results.items():
        other_models = [results[m] for m in results if m != model]

        for metric in metric_labels:
            # Create the diff dictionary up front so it exists even when the
            # metric has no entries; the check below and later cells index it.
            diffs = model_metrics.setdefault(f'diff_{metric}', {})
            for word, value in model_metrics[metric].items():
                squared = sum(
                    (value - other[metric].get(word, 0)) ** 2
                    for other in other_models
                )
                diffs[word] = math.sqrt(squared)

    # Example output for checking
    probe = 'believe'
    for model, model_metrics in results.items():
        print(f"Model: {model}")
        for metric, label in metric_labels.items():
            diffs = model_metrics[f'diff_{metric}']
            if probe in diffs:
                print(f"Euclidean Distance {label} for '{probe}': {diffs[probe]}")
    return results

In [None]:
# compute_euclidean_differences adds the diff_* dictionaries to `results2` in
# place and returns that same object, so both names refer to the same data.
euclidean_differences=compute_euclidean_differences(results2)

In [None]:
# Top-level keys are the model names.
euclidean_differences.keys()

In [None]:
def create_standardized_dataframe(results):
    """Assemble the per-model comparison table, print two LaTeX tables, return the table.

    Steps, in order:
      1. Build a wide DataFrame with two-level columns (model, field) holding
         each model's nodes and strength, then add closeness, degree and tf.
      2. Add scaled display columns: d = degree/100 and f = tf/1000 (1 decimal),
         c = closeness (2 decimals).
      3. Add the Euclidean differences Dd, Df, Dc from the diff_* dictionaries,
         scaled and rounded the same way.
      4. Add per-model z-scores of degree, tf and closeness.
      5. For each model, sort its rows by node and compute AvgS: the row-wise
         mean over the other models of abs(1 - cosine distance) between the
         (closeness, degree, tf) z-score vectors.
      6. Re-sort each model's rows by f (descending), concatenate the models
         side by side, drop the z-score columns and cast every column except
         AvgS to str.
      7. Print the first 20 rows as two LaTeX tables (two models each).

    Args:
        results: mapping model -> dict with the keys 'strength', 'closeness',
            'degree', 'tf', 'diff_degree', 'diff_tf' and 'diff_closeness',
            each a dict node -> number.

    Returns:
        The concatenated, re-sorted DataFrame described in step 6.

    Also reads the notebook-level names `folders_of_interest` and `language`
    for the caption, and calls `display`, which is not imported in this file
    (presumably the one IPython injects in a notebook - confirm if this is
    ever run as a plain script).
    """
    # One (node, strength) frame per model placed side by side; the columns
    # are addressed below as (model, field).
    df = pd.concat({k: pd.DataFrame(v['strength'].items(), columns=['node', 'strength']) for k, v in results.items()}, axis=1)

    # Show the rows that contain a NaN before they are dropped.
    # NOTE(review): presumably these only appear when models have different
    # numbers of nodes - confirm.
    display(df[df.isna().any(axis=1)])

    # Delete rows with NaN values
    df = df.dropna()

    # Attach the remaining raw metrics, looked up by each model's own node column.
    for model in results:
        df[(model, 'closeness')] = df[(model, 'node')].map(results[model]['closeness'])
        df[(model, 'degree')] = df[(model, 'node')].map(results[model]['degree'])
        df[(model, 'tf')] = df[(model, 'node')].map(results[model]['tf'])

    # Scaled / rounded copies used for display in the tables.
    for model in results:
        #df[(model, 's')] = (df[(model, 'strength')]/1000).round(1)
        df[(model, 'd')] = (df[(model, 'degree')]/100).round(1)
        df[(model, 'f')] = (df[(model, 'tf')]/1000).round(1)
        df[(model, 'c')] = df[(model, 'closeness')].round(2)

    # Euclidean differences to the other models (the diff_* dictionaries).
    for model in results:
        #df[(model, 'Ds')] = df[(model, 'node')].map(results[model]['diff_strength'])
        df[(model, 'Dd')] = df[(model, 'node')].map(results[model]['diff_degree'])
        df[(model, 'Df')] = df[(model, 'node')].map(results[model]['diff_tf'])
        df[(model, 'Dc')] = df[(model, 'node')].map(results[model]['diff_closeness'])

    # Same scaling as d / f / c above.
    for model in results:
        #df[(model, 'Ds')] = (df[(model, 'Ds')]/1000).round(1)
        df[(model, 'Dd')] = (df[(model, 'Dd')]/100).round(1)
        df[(model, 'Df')] = (df[(model, 'Df')]/1000).round(1)
        df[(model, 'Dc')] = df[(model, 'Dc')].round(2)


    # Z-scores are computed on the raw (unscaled) metrics, per model.
    for model in results:
        for metric in ['degree', 'tf', 'closeness']: #add strength here if you need it
            mean = df[(model, metric)].mean()
            std = df[(model, metric)].std()
            if std == 0:
                # constant column: avoid dividing by zero
                df[(model, f'{metric}_zscore')] = 0
            else:
                df[(model, f'{metric}_zscore')] = (df[(model, metric)] - mean) / std

    # Step 5: Sort each model's data by nodes
    # The cosine step below pairs rows by position, so this relies on every
    # model ending up with the same nodes in the same sorted order.
    sorted_dfs = {}
    for model in results:
        model_columns = [(model, col) for col in df.columns.levels[1] if col in ['node','d','f','c', 'degree_zscore', 'tf_zscore','closeness_zscore','Dd','Df','Dc']] #add s and strength_zscore here if you need it
        sorted_dfs[model] = df[model_columns].sort_values(by=(model, 'node')).reset_index(drop=True)


    # Disabled earlier variant of step 6 (one similarity column per model pair
    # instead of an average), kept for reference.
    ## Step 6: Calculate cosine similarities
    #models = list(results.keys())
    #for i, model1 in enumerate(models):
    #    for j, model2 in enumerate(models):
    #        if i < j:  # Only calculate for unique pairs
    #            vec1 = sorted_dfs[model1][[
    #                (model1, 'strength_zscore'),
    #                #(model1, 'closeness_zscore'),
    #                (model1, 'degree_zscore'),
    #                (model1, 'tf_zscore')
    #            ]].values
    #            vec2 = sorted_dfs[model2][[
    #                (model2, 'strength_zscore'),
    #                #(model2, 'closeness_zscore'),
    #                (model2, 'degree_zscore'),
    #                (model2, 'tf_zscore')
    #            ]].values
    #
    #            cosine_sim = np.array([
    #                abs(1 - cosine(v1, v2) if not np.isnan(v1).any() and not np.isnan(v2).any() else np.nan)
    #                for v1, v2 in zip(vec1, vec2)
    #            ])
    #
    #            sorted_dfs[model1][(model1, f'S{model2[0]}')] = cosine_sim
    #            sorted_dfs[model2][(model2, f'S{model1[0]}')] = cosine_sim
    # Step 6: Calculate average cosine similarity
    models = list(results.keys())
    for i, model1 in enumerate(models):
        cosine_sims = []
        for j, model2 in enumerate(models):
            if i != j:  # Calculate for all pairs except self
                vec1 = sorted_dfs[model1][[
                    (model1, 'closeness_zscore'),
                    (model1, 'degree_zscore'),
                    (model1, 'tf_zscore')
                ]].values
                vec2 = sorted_dfs[model2][[
                    (model2, 'closeness_zscore'),
                    (model2, 'degree_zscore'),
                    (model2, 'tf_zscore')
                ]].values

                # Row i of model1 is compared with row i of model2; rows with a
                # NaN in either vector yield NaN and are skipped by nanmean below.
                cosine_sim = np.array([
                    abs(1 - cosine(v1, v2) if not np.isnan(v1).any() and not np.isnan(v2).any() else np.nan)
                    for v1, v2 in zip(vec1, vec2)
                ])

                cosine_sims.append(cosine_sim)

        # Mean over the other models, per row.
        avg_cosine_sim = np.nanmean(cosine_sims, axis=0)
        sorted_dfs[model1][(model1, 'AvgS')] = avg_cosine_sim

    #for model in results:
    #    model_columns = [(model, col) for col in ['node', 'strength_zscore', 'closeness_zscore', 'degree_zscore', 'tf_zscore']]
    # Step 7: Sort each model's data by f (scaled, rounded frequency), highest first.
    # NOTE(review): f is tf/1000 rounded to one decimal, so many rows can tie.

    for model in results:
        sorted_dfs[model] = sorted_dfs[model].sort_values(by=(model, 'f'), ascending=False).reset_index(drop=True)


    # Step 8: Concatenate sorted DataFrames
    # Each model was re-sorted independently, so after this a row no longer
    # refers to the same node across models.
    df_sorted = pd.concat(sorted_dfs.values(), axis=1)

    # The z-scores were only needed for AvgS.
    for model in results:
        for metric in ['closeness_zscore', 'degree_zscore', 'tf_zscore']:
            del df_sorted[(model, metric)]


    # Everything except AvgS becomes text, so the float_format passed to
    # to_latex() below can only affect AvgS.
    for column in df_sorted.columns:
        if column[1] != 'AvgS':
            df_sorted[column] = df_sorted[column].astype(str)


    # Split the dataframe into two parts
    models = list(results.keys())
    print(models)
    # NOTE(review): the list taken from `results` is printed and then replaced
    # by these four hard-coded names; the .loc selections below use them, so
    # they must match the keys of `results`.
    models = ['gpt-3.5','Haiku','mistral-7b','Llama-3-8B']
    df_first_two = df_sorted.loc[:, models[:2]]
    df_second_two = df_sorted.loc[:, models[2:]]

    # Lookup tables for the caption text.
    captiondictionary={'climate':'Climate Change',
                       'gwarming':'Global warming',
                       'math': 'Math anxiety',
                       'misinformation_health': 'Misinformation in health'}
    captionlanguage={'ita':'Italian',
                     'eng':'English'}

    # Function to create LaTeX table
    def create_latex_table(df, caption):
        """Return a LaTeX table environment with the first 20 rows of `df`.

        NOTE(review): the `caption` argument is overwritten on the first line,
        so whatever the caller passes is ignored. The generated caption names
        d, f, Dd, Df, Dc and AvgS but does not explain the c column, and the
        last two sentences are joined without a space.
        """
        caption = f"Top 20 nodes (based on frequency) for the {captiondictionary[folders_of_interest[0]]} topic in {captionlanguage[language]}. "
        caption += f"Columns show node name (node), degree (d, divided by 100), frequency (f, divided by 1000), "
        caption += f"and their Euclidean differences (Dd, Df, Dc) to other models. "
        caption += "AvgS represents the average cosine similarity with other models."
        caption += "Note: d and Dd are shown after division by 100, while f and Df are shown after division by 1000."


        latex_table = "\\begin{table}[htbp]\n\\centering\n\\setlength{\\tabcolsep}{4pt}\n\\footnotesize\n"
        latex_table += df.head(20).to_latex(index=False, float_format="%.2f")
        # NOTE(review): every table gets the same label (tab:mytable).
        latex_table += f"\\caption{{{caption}}}\n\\label{{tab:mytable}}\n\\end{{table}}"
        return latex_table

    # Create LaTeX tables
    latex_table1 = create_latex_table(df_first_two, f"Top 20 nodes (based on f) for {folders_of_interest[0]}")
    latex_table2 = create_latex_table(df_second_two, f"Top 20 nodes (based on f) for  {folders_of_interest[0]}")
    #latex_output = df_sorted.head(20).to_latex(index=False, float_format=f"%.1f")
    print(latex_table1, '\n')
    print(latex_table2)

    return df_sorted

In [None]:
# Lift the column display limit (this changes the pandas option for the rest of
# the session), then build the table and show its first 20 rows.
pd.set_option('display.max_columns', None)
display(create_standardized_dataframe(euclidean_differences).head(20))