In [14]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
import pickle
import os
import pandas as pd

In [15]:
# Root folder under which this notebook writes its heatmap PNG files.
dir_images = "images_test"

In [16]:
# One entry per year, 1951 through 2025 inclusive; passed to show_matrix as the axis years.
years = [*range(1951, 2026)]

In [17]:
# Model identifiers and aggregation types; together they form the matrix file names
# looked up in the "matrices" folder (<topdown|downtop>_<type>_<model>.npy).
models = [
    'openai',
    'intfloate5',
    'colbert',
    "words-no-rep",
]
types = [
    'tresh',
    'mean',
    'median',
]

In [18]:
# Load the pre-computed matrices into a nested dict: dict_vectors_matrices[type][model].
# For every (type, model) pair the "topdown_" file is preferred and the "downtop_" file is
# the fallback. Pairs with neither file on disk are skipped silently, so an inner dict may
# contain fewer models than `models`.
matrices_dir = "matrices"
available_files = set(os.listdir(matrices_dir))  # list the folder once instead of per lookup

dict_vectors_matrices = {}
for t in types:
    dict_vectors_matrices[t] = {}
    for model in models:
        for prefix in ("topdown", "downtop"):
            file_name = f"{prefix}_{t}_{model}.npy"
            if file_name in available_files:
                dict_vectors_matrices[t][model] = np.load(os.path.join(matrices_dir, file_name))
                break  # first match wins, mirroring the original if/elif precedence

In [19]:
def show_matrix(matrix_npy, matrix_name, years, output_dir, show = False, vmin = 0, vmax = 1):
    """Draw a year-by-year matrix as a heatmap and save it as a PNG.

    Parameters
    ----------
    matrix_npy : array-like
        2-D matrix to plot; row/column i is labelled with ``years[i]``.
    matrix_name : str
        Used both as the figure title and as the output file name (without extension).
    years : sequence
        Axis years; every element must be convertible with ``int()``.
    output_dir : str
        Folder the PNG is written to (``<output_dir>/<matrix_name>.png``). It is created
        if it does not exist yet.
    show : bool, default False
        If truthy, also display the figure before closing it.
    vmin, vmax : float, default 0 and 1
        Limits of the colour scale. The defaults keep the original fixed 0..1 range;
        pass other limits for matrices whose values fall outside it.
    """
    # Only years divisible by 5 get a printed label.
    tick_labels = [year if int(year) % 5 == 0 else "" for year in years]
    labelled_years = [year for year in years if int(year) % 5 == 0]

    # Cell i of the heatmap spans [i, i + 1], so i + 0.5 is its centre.
    major_ticks = [i + 0.5 for i, year in enumerate(years) if int(year) % 5 == 0]
    minor_ticks = [i + 0.5 for i, year in enumerate(years) if int(year) % 5 != 0]

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        sns.heatmap(
            matrix_npy,
            cmap='coolwarm',
            vmin=vmin,
            vmax=vmax,
            cbar=False,
            xticklabels=tick_labels,
            yticklabels=tick_labels,
            ax=ax,
        )

        # Major ticks (with labels) every 5 years, on both axes.
        ax.set_xticks(major_ticks)
        ax.set_xticklabels(labelled_years, fontsize=10)
        ax.set_yticks(major_ticks)
        ax.set_yticklabels(labelled_years, fontsize=10)

        # Unlabelled minor ticks for all the remaining years.
        ax.set_xticks(minor_ticks, minor=True)
        ax.set_yticks(minor_ticks, minor=True)

        ax.tick_params(axis='both', which='major', length=5, width=2)  # longer and thicker
        ax.tick_params(axis='both', which='minor', length=2, width=1)  # shorter and thinner

        ax.set_title(matrix_name)

        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        fig.savefig(f"{output_dir}/{matrix_name}.png", format="png", dpi=150)
        if show:
            plt.show()
    finally:
        # Always release the figure, even if saving failed, so repeated calls do not
        # accumulate open figures.
        plt.close(fig)

In [20]:
# Sanity check: the top-level keys are the aggregation types.
dict_vectors_matrices.keys()

dict_keys(['tresh', 'mean', 'median'])

In [21]:
# Models for which a "median" matrix file was found on disk and loaded.
dict_vectors_matrices['median'].keys()

dict_keys(['openai', 'intfloate5', 'colbert'])

In [22]:
from tqdm import tqdm

In [24]:
# Render one heatmap per (method, model) pair under <dir_images>/<method>/ and collect
# every matrix into `df` (columns: method, model, matrix).
os.makedirs(dir_images, exist_ok=True)

# Gather plain records and build the DataFrame once at the end: concatenating onto a
# growing frame inside the loop copies all previous rows on every iteration.
matrix_records = []
for method in tqdm(dict_vectors_matrices.keys()):
    print(method)
    folder_path = f"{dir_images}/{method}"
    os.makedirs(folder_path, exist_ok=True)
    for model in dict_vectors_matrices[method].keys():
        matrix = dict_vectors_matrices[method][model]
        show_matrix(matrix, f"{method}_{model}", years, folder_path, show = False)
        matrix_records.append({"method": method, "model": model, "matrix": matrix})

# Passing `columns` keeps the three expected columns even when nothing was loaded.
df = pd.DataFrame(matrix_records, columns=['method', 'model', 'matrix'])


  0%|          | 0/3 [00:00<?, ?it/s]

tresh


 33%|███▎      | 1/3 [00:02<00:04,  2.15s/it]

mean


 67%|██████▋   | 2/3 [00:02<00:01,  1.29s/it]

median


100%|██████████| 3/3 [00:04<00:00,  1.57s/it]


In [25]:
# Display the collected (method, model, matrix) table.
df

Unnamed: 0,method,model,matrix
0,tresh,openai,"[[0.17894736842105263, 0.1375, 0.195, 0.1775, ..."
1,tresh,intfloate5,"[[0.06315789473684211, 0.0825, 0.0875, 0.0875,..."
2,tresh,colbert,"[[0.021052631578947368, 0.0275, 0.03, 0.05, 0...."
3,mean,words-no-rep,"[[2.9263157894736844, 3.54, 3.58, 4.14, 3.6531..."
4,median,openai,"[[0.4027328856513724, 0.32589184185569203, 0.3..."
5,median,intfloate5,"[[0.20290968, 0.25855863, 0.23232666, 0.209459..."
6,median,colbert,"[[0.034699806326662364, 0.13775016139444804, 0..."


In [26]:
# Number of rows (one per loaded model matrix) for each method.
family_counts = dict(df["method"].value_counts())

def mean_matrices(matrices):
    """Return the element-wise average of several matrices as nested Python lists.

    `matrices` is any collection NumPy can convert to an array (list, pandas Series, ...);
    the average is taken across the collection, i.e. along the first axis.
    """
    stacked = np.array(matrices)
    averaged = stacked.mean(axis=0)
    return averaged.tolist()

# Methods that contributed at least one matrix; only these keep a "MEAN" row.
ffilt = [name for name, count in family_counts.items() if count >= 1]

# One synthetic "MEAN" row per method: the element-wise average of that method's matrices.
family_means = (
    df.groupby("method")["matrix"]
    .apply(mean_matrices)
    .reset_index()
    .assign(model="MEAN")
)
family_means = family_means.loc[
    family_means["method"].isin(ffilt), ["method", "model", "matrix"]
]

# Append the MEAN rows to the original DataFrame, then order by method and model (descending).
df_final = (
    pd.concat([df, family_means], ignore_index=True)
    .sort_values(by=["method", "model"], ascending=[False, False])
)

In [28]:
# In each "method" subfolder, save the heatmap of that method's MEAN matrix (the average
# across all model matrices of the method, stored in the rows with model == "MEAN").
# Iterating over the MEAN rows pairs every method with its own matrix; the previous lookup
# filtered on the literal "tresh" and therefore reused the tresh mean for every method.
mean_rows = df_final[df_final['model'] == "MEAN"]
mean_matrices_list = []
for method, mean_matrix in zip(mean_rows['method'], mean_rows['matrix']):
    show_matrix(mean_matrix, f"{method}_MEAN", years, f"{dir_images}/{method}", show = False)
    mean_matrices_list.append(mean_matrix)


# Save the global mean matrix, computed from the per-method mean matrices above (each
# method weighted equally), in the main folder for the matrix images.
# NOTE(review): the per-method matrices may be on different scales (the recorded "mean"
# rows show values above 1) - confirm that averaging them together is intended.
global_mean = mean_matrices(mean_matrices_list)
show_matrix(global_mean, "global_MEAN", years, dir_images, show = False)