In [8]:
import os
import numpy as np

%load_ext autoreload
%autoreload 2

#display all the results not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from node_embeddings.models import Undirected_MSM
from node_embeddings.plots.plotting_functions import *
import node_embeddings.utils as utils

# Which dataset to analyse and how its edges are interpreted.
dataset_name, dataset_direction = "Gleditsch", "Undirected"
distance_matrix = None
lvl_to_nclust = None

# Dataset-specific settings returned by the project helper; forwarded unchanged to the loader.
id_code, cg_method, year = utils.default_4_undirected_paper(dataset_name)

# NOTE: `kwargs` carries its own copy of `pdtrans` (per the original author's remark).
pdtrans, kwargs, total_levels = utils.dataset_loader(
	dataset_name,
	dataset_direction = dataset_direction,
	id_code = id_code,
	cg_method = cg_method,
	year = year,
)
print(f'-total_levels: {total_levels}')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'utils'

### Modelling the pdtrans

In [27]:
# Select the family of models to fit and the confidence level forwarded to the plots.
# For multi-dimensional models use maxlMSM with dimX > 1, or LPCA.
ssm_repr, confidence = "LPCA", 0.95

if ssm_repr == "CM":
	# one entry per model; every model of this family is fitted with dimension 1
	models_name = ["CM", "fitnCM", "fitnMSM", "degcMSM", "maxlMSM"]
	model_objective = "NetRec"
	# models_name = ["maxlMSM"]
	dims = [1] * len(models_name)
elif ssm_repr == "LPCA":
	model_objective = "NodeEmb"
	# alternative run: ["LPCA"]*2 + ["maxlMSM"]*4 with dims [(1,1), (8,8)] + [1,2,8,16]
	models_name = ["LPCA"]
	# LPCA dimensions are (dimB, dimC) pairs, maxlMSM dimensions are plain integers
	dims = [(1,1),]
	# models_name, dims = ["maxlMSM"], [1]
else:
	# fail here instead of leaving models_name / dims / model_objective unbound,
	# or silently stale from a previous run of this cell
	raise ValueError(f"Unsupported ssm_repr {ssm_repr!r}: expected 'CM' or 'LPCA'")

# the two lists are consumed pairwise with zip(), which would silently drop unmatched entries
if len(models_name) != len(dims):
	raise ValueError(f"models_name has {len(models_name)} entries but dims has {len(dims)}")

In [28]:
# Shrink total_levels to the first level whose binary adjacency matrix is entirely filled
# (sum of the entries equal to the number of entries), so that level is excluded from the
# np.arange(total_levels) loops that follow.
print(f'-Estimated total_levels: {total_levels}',)
for level in np.arange(total_levels):
	kwargs.update({"level" : level})
	net = Undirected_Graph(**kwargs)
	if net.bin_adj.sum() == net.bin_adj.size:
		total_levels = level
		# stop at the FIRST complete level: without the break a later complete level
		# would overwrite the value and the first complete level would stay in range
		break

	# if level == 2: break
print(f'-Effective total_levels: {total_levels}',)

-Estimated total_levels: 6
-Effective total_levels: 6


In [29]:
# Fit (or reuse the cached parameters of) every requested model at every coarse-graining
# level, then build the matching "sum-" model and compare it with the level-specific fit.
for model_name, dim in zip(models_name, dims):
	loss_iters = int(1e3)
	# handed to sum_model.track_n_edges at every fitted level and saved once all levels are done
	list_rel_err = []

	# LPCA is parameterised by a (dimB, dimC) pair, every other model by a single dimX
	if model_name.endswith("LPCA"):
		dimX = None
		dimB, dimC = dim
	else:
		dimX = dim
		dimB, dimC = None, None

	# loop-invariant flag forwarded to _set_minimizers: only set for maxlMSM
	track_nll_opt = model_name == "maxlMSM"

	for level in np.arange(total_levels):
		print(f'\n-Level: {level} for {model_name} - dim {dim}',)

		kwargs.update({"level" : level})
		net = Undirected_Graph(**kwargs)

		und_density = 2 * net.n_edges / (net.n_nodes * (net.n_nodes - 1))
		print(f'-net.n_nodes, net.n_edges, und_density: {net.n_nodes}, {net.n_edges}, {und_density:.4}',)

		model_kwargs = {
				"name" : model_name,
				"dimX" : dimX,
				"dimB" : dimB,
				"dimC" : dimC,
				"pick_l" : 0,
				"pdtrans" : net.pdtrans,
				"initial_guess" : "random",
				"seed" : 0,
				"fc_direction" : "cg",
				"dataset_name" : "2022-Undirected",
				"objective" : model_objective,
			}
		fit_model = Undirected_MSM(obs_net = net, **model_kwargs)

		# stop when no node is left outside net.fc_nodes or when some node has zero degree
		if net.n_nodes - net.fc_nodes.size <= 0 or np.any(net.deg == 0):
			print('--------> The graph is COMPLETE or disconnected. Stop Fitting')
			break

		if os.path.exists(fit_model.model_dir + "/X.csv"):
			# nothing is read here; NOTE(review): presumably the constructor already loaded X.csv — confirm
			print(f"-LOADING the (reassigned) fitnesses from {fit_model.model_dir}/X.csv'")
		else:
			print(f'-Fitting the {fit_model.name} model with reduced_by {fit_model.reduced_by}',)

			fit_model._set_initial_guess()
			A = fit_model.zl_notdet_A if fit_model.get("reduced_by") == "neigh" else net.zl_bin_adj

			fit_model._set_minimizers(A = A, lr = 5e-3, track_nll_opt = track_nll_opt)

			if fit_model.name.startswith("fitn"):
				# external node variable passed to n_edges_fit as mic_ext_var
				if dataset_name == "ING":
					mic_sseq = 0
				elif dataset_name == "Gleditsch":
					mic_sseq = np.genfromtxt(net.dataset_folder + "/node_gdp.csv", delimiter = ",").reshape(-1, 1)
				else:
					# previously mic_sseq stayed unbound here and the next line raised a NameError
					raise ValueError(f"No external node variable configured for dataset_name {dataset_name!r}: expected 'ING' or 'Gleditsch'")

				_ = fit_model.n_edges_fit(obs_net = net, mic_ext_var = mic_sseq)

			elif fit_model.name.startswith(("degcMSM", "CM")):
				_ = fit_model.deg_fit()

			elif fit_model.name.startswith(("maxlMSM", "LPCA")):
				n_epochs = 20
				opt_params = {"n_epochs" : {"adam": n_epochs, "tnc" : n_epochs}}

				# run the optimisers one after the other, in the order listed above
				for method, method_epochs in opt_params["n_epochs"].items():
					fit_model.opt_method = method
					# minimize the NLL for the upper-triangular part of the matrix
					fit_model.minimize(
									A = A,
									n_epochs = method_epochs,
									opt_method = method, loss_iters = loss_iters,
									ftol = np.finfo(float).eps, verbose = False
									)

				# only after the method loop can the fitnesses be saved, otherwise they won't be overwritten
				# replicate the fitnesses and insert the fully-connected nodes
				if fit_model.get("reduced_by"):
					print('-Insert Final Value of X',)
					fit_model.X = fit_model.repeat_streq_X(red_X = fit_model.X).reshape(-1, fit_model.dimBCX)
					fit_model.X = fit_model.set_fcfd_X().reshape(-1, fit_model.dimBCX)

				_ = fit_model.zl_pmatrix_func(x = fit_model.X, set_w_diag = True)

		# to plot only the fitted model:
		# plots_rec_bin_vs_deg_cm_roc_prc(net, fit_model, ref_model = fit_model, confidence = confidence, n_samples=int(4))

		# REDO this by updating only the model_kwargs and dropping flevel_pdtranss
		model_kwargs.update({"name" : f"sum-{model_name}", "reduced_by" : False})
		sum_model = Undirected_MSM(obs_net = net, **model_kwargs)
		sum_model.sum_Xw_prevl(obs_net = net, ref_model = fit_model)

		# track the relative error of the number of edges
		sum_model.track_n_edges(list_rel_err, net)

		# plot all the models
		plots_rec_bin_vs_deg_cm_roc_prc(net, sum_model = sum_model, ref_model = fit_model, confidence = confidence, n_samples=int(1000))

		# signed rel error
		print(f'-signed_rel_err(x = sum_model.X, y = fit_model).sum(): {signed_rel_err(x = sum_model.X, y = fit_model.X).sum()}',)

		print("*"*40)

	# save the tracked n_edges errors only when every level was fitted; with zero levels
	# there is no sum_model from this run to take the output folder from
	if total_levels > 0 and len(list_rel_err) == total_levels:
		np.savetxt(os.path.dirname(sum_model.model_dir) + "/rel_err_n_edges_across_levels.csv", list_rel_err, delimiter = ",")


-Level: 0 for LPCA - dim (1, 1)
-net.n_nodes, net.n_edges, und_density: 182, 9993, 0.6067
-LOADING the (reassigned) fitnesses from /Users/jg25gs/Documents/code_local_files/outputs/datasets/Gleditsch-Undirected/vars/LPCA/dimB1/dimC1/geo-dist_ig-random/level0/X.csv'
-signed_rel_err(x = sum_model.X, y = fit_model).sum(): 0.0
****************************************

-Level: 1 for LPCA - dim (1, 1)
-net.n_nodes, net.n_edges, und_density: 152, 7305, 0.6365
-LOADING the (reassigned) fitnesses from /Users/jg25gs/Documents/code_local_files/outputs/datasets/Gleditsch-Undirected/vars/LPCA/dimB1/dimC1/geo-dist_ig-random/level1/X.csv'
-signed_rel_err(x = sum_model.X, y = fit_model).sum(): 57.27782531585542
****************************************

-Level: 2 for LPCA - dim (1, 1)
-net.n_nodes, net.n_edges, und_density: 122, 4822, 0.6533
-LOADING the (reassigned) fitnesses from /Users/jg25gs/Documents/code_local_files/outputs/datasets/Gleditsch-Undirected/vars/LPCA/dimB1/dimC1/geo-dist_ig-random/le

In [30]:
# Levels passed to plots_net_meas_by_level later on: all of them for ING, a fixed subset otherwise.
if dataset_name == "ING":
	levels = range(total_levels)
else:
	levels = [0,1,4,5]

# [marker, colour] of every model, keyed by "<model>-<dim>"; the set depends on the
# objective stored in fit_model.
if not fit_model.get("objective").endswith("NetRec"):
	name_mark_color = {
		'LPCA-(1,1)' : ['o', '#22aaa1'],
		'LPCA-(8,8)' : ['^', '#335c67'],
		'maxlMSM-1' : ['o', '#7F95D1'],
		'maxlMSM-2' : ['*', '#e09f3e'],
		'maxlMSM-8' : ['P', '#CA3C25'],
		"maxlMSM-16" : ['^', "#632B30"],
	}
else:
	name_mark_color = {
		'fitnMSM-1' : ['*', '#22aaa1'],
		'fitnCM-1' : ['*', '#335c67'],
		'CM-1' : ['<', '#7F95D1'],
		'degcMSM-1' : ['^', '#e09f3e'],
		'maxlMSM-1' : ['v', '#CA3C25'],
	}

# Re-key the styles with a "sum-" prefix. The keys keep the insertion order of the
# dictionary above (they are not sorted).
name_mark_color = {"sum-" + label: style for label, style in name_mark_color.items()}
sum_mod_names = list(name_mark_color)

idx_first_self_loop_model = 3

for level in np.arange(total_levels):
	print(f'-Level: {level}',)
	kwargs.update({"level" : level, "pdtrans" : pdtrans})
	net = Undirected_Graph(**kwargs)

	# same condition used when fitting: skip the levels that were not fitted
	if not (net.n_nodes - net.fc_nodes.size > 0 and ~np.any(net.deg == 0)):
		continue

	plot_triangles_at_c(
		obs_net = net,
		ref_model = fit_model,
		name_mark_color = name_mark_color,
		n_points = int(2e1),
	)

	# this plot is drawn only when fitnMSM is among the styled models
	if 'sum-fitnMSM-1' in sum_mod_names:
		level_exoX_vs_topoX(
			obs_net = net,
			sum_model = fit_model,
			name_mark_color = name_mark_color,
			sum_model_ms = 3,
		)

	# the summed-vs-coarse-grained comparison is not drawn at level 0
	if level < 1:
		continue

	if ssm_repr == "CM":
		plot_sum_vs_cg_pmatrix(net, fit_model, level, class_models = "fitn_models")
		plot_sum_vs_cg_pmatrix(net, fit_model, level, class_models = "degc_models")
	else:
		plot_sum_vs_cg_pmatrix(net, fit_model, level, class_models = "NE_models")

# NOTE: this rebinds `dims` to the text after the last "-" of every key (e.g. "(1,1)" or "16")
dims = [key.split("-")[-1] for key in name_mark_color]
plots_rel_err_n_edges_across_levels(sum_model, name_mark_color, dims = dims, levels = range(total_levels))

-Level: 0
-Level: 1
-Level: 2
-Level: 3
-Level: 4
-Level: 5


In [34]:
# The pair of models (and their dimensions) compared against each other.
if ssm_repr != "CM":
	model_names_cc, dims_cc = ["LPCA", "maxlMSM"], [(8,8), 16]
else:
	# alternative pair: ["CM", "fitnCM"]
	model_names_cc, dims_cc = ["CM", "degcMSM"], [1, 1]

markers_cc = ['o'] * 2
# NOTE(review): both branches currently give the same levels
level_cc = [0,2] if dataset_name == "ING" else [0,2]
# first model drawn with the reference colour, second one with the summed-model colour
colors_cc = {
	model_names_cc[0] : ref_model_color,
	model_names_cc[1] : sum_model_color,
}

print(f'-level_cc: {level_cc}',)
# the scale now is linear since the zero value would have spoiled the range
for net_meas in ("deg", "annd", "cc"):
	print(f'-Plotting: {net_meas}',)
	plot_cross_comparison(
		fit_model, model_names_cc, dims_cc, markers_cc, colors_cc, level_cc, kwargs,
		net_meas = net_meas,
	)

-level_cc: [0, 2]
-Plotting: deg
-name, level, norm_diff: ('LPCA', 0, 0.14810012546252882)
-name, level, norm_diff: ('maxlMSM', 0, 0.23360286704263458)
-name, level, norm_diff: ('LPCA', 2, 0.29958221931892326)
-name, level, norm_diff: ('maxlMSM', 2, 0.22746777903277854)
-Plotting: annd
-name, level, norm_diff: ('LPCA', 0, 0.14810012546252882)
-name, level, norm_diff: ('maxlMSM', 0, 0.23360286704263458)
-name, level, norm_diff: ('LPCA', 2, 0.29958221931892326)
-name, level, norm_diff: ('maxlMSM', 2, 0.22746777903277854)
-Plotting: cc
-name, level, norm_diff: ('LPCA', 0, 0.14810012546252882)
-name, level, norm_diff: ('maxlMSM', 0, 0.23360286704263458)
-name, level, norm_diff: ('LPCA', 2, 0.29958221931892326)
-name, level, norm_diff: ('maxlMSM', 2, 0.22746777903277854)


In [32]:
# Reconstruction-accuracy plots for the selected model family, followed by the AUC curves.
if ssm_repr == "CM":
	# every key loses its last two characters before being used as a model name
	for name in [key[:-2] for key in name_mark_color]:
		print(f'-name: {name}',)
		plots_net_meas_by_level(
			obs_net = net, ref_model = fit_model,
			levels = levels, name = name, dim = dim,
		)
	if confidence:
		plot_rec_acc_by_level(net, fit_model, name_mark_color, total_levels, confidence)
elif ssm_repr == "LPCA" and confidence:
	plot_rec_acc_by_meas(net, fit_model, name_mark_color, total_levels, confidence)

plot_auc_roc_prc(net, name_mark_color, fit_model, total_levels, yscale = "linear")

-Plotting the Reconstruction Accuracy for the summed model
-sum-LPCA-(1,1), rec_acc: [1.         0.89473684 0.62295082 0.31521739 0.25806452 0.34375
 1.         0.57236842 0.41803279 0.35869565 0.29032258 0.21875
 0.98351648 0.71052632 0.59836066 0.55434783 0.43548387 0.5       ]
-sum-LPCA-(8,8), rec_acc: [1.         0.76973684 0.44262295 0.18478261 0.17741935 0.1875
 1.         0.30263158 0.20491803 0.11956522 0.06451613 0.09375
 1.         0.54605263 0.44262295 0.34782609 0.29032258 0.28125   ]
-sum-maxlMSM-1, rec_acc: [0.99450549 0.96052632 0.90983607 0.91304348 0.88709677 0.90625
 0.93406593 0.80921053 0.80327869 0.85869565 0.91935484 0.84375
 0.92857143 0.83552632 0.79508197 0.88043478 0.91935484 0.90625   ]
-sum-maxlMSM-2, rec_acc: [0.99450549 0.98026316 0.93442623 0.94565217 0.91935484 0.9375
 0.92307692 0.86184211 0.87704918 0.92391304 0.9516129  0.875
 0.92857143 0.84868421 0.89344262 0.90217391 0.93548387 0.90625   ]
-sum-maxlMSM-8, rec_acc: [1.         1.         1.         

For the ``AIC`` and ``BIC`` tables

In [None]:
if ssm_repr == "LPCA":
	def get_trained_dimensions(self, n_parents = 2, folder_path = None, dim_prefix = "dimX"):
		"""Return the sorted integer dimensions found as sub-folders of a model folder.

		A sub-folder counts when its name is `dim_prefix` followed by an integer
		(e.g. "dimX16" -> 16). Files and folders with any other name are ignored.

		Parameters
		----------
		self : unused; kept so existing calls such as get_trained_dimensions(fit_model) still work.
		n_parents : unused; kept for backward compatibility (it belonged to a disabled
			derivation of the folder from self.model_dir).
		folder_path : folder to scan. When None, the maxlMSM output folder of the global
			`net` under the user's home directory is used.
		dim_prefix : prefix that precedes the integer in the folder names.

		Returns
		-------
		list of int, sorted in ascending order.

		Raises
		------
		OSError (e.g. FileNotFoundError) from os.listdir when the folder cannot be listed.
		"""
		import os

		if folder_path is None:
			folder_path = f"{os.path.expanduser('~')}/Documents/code_local_files/outputs/datasets/{net.dataset_name}/vars/maxlMSM"

		dimensions = []
		for entry in os.listdir(folder_path):
			# keep directories only, and only the ones carrying the prefix
			if not entry.startswith(dim_prefix):
				continue
			if not os.path.isdir(os.path.join(folder_path, entry)):
				continue
			try:
				dimensions.append(int(entry[len(dim_prefix):]))
			except ValueError:
				# e.g. "dimX_old": the suffix is not an integer, so it is not a trained dimension
				continue

		return sorted(dimensions)

	ref_model = fit_model

	# LPCA is always listed with these two (dimB, dimC) pairs; maxlMSM with every
	# dimension found on disk by get_trained_dimensions
	lpca_dims = [(1,1), (8,8)]
	extra_dims = get_trained_dimensions(fit_model)
	dims = lpca_dims + extra_dims
	models = ["LPCA"]*len(lpca_dims) + ["maxlMSM"]*len(extra_dims)

	# keep both results: previously the "aic" one was bound to `df` and overwritten straight away
	df_aic = rescale_aic_bic_df(fit_model, models, dims, total_levels, "aic")
	df_bic = rescale_aic_bic_df(fit_model, models, dims, total_levels, "bic")
	# `df` still ends up holding the "bic" result, as before
	df = df_bic