# Learning from networks project
### Evaluation of different Node Embedding algorithms
Members:<br>
- D'Emilio Filippo, id : 2120931
- Volpato Pietro, id : 2120825

### Information about the notebook

In [1]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import gzip
import re
import os

# configuration
Here you can properly configure the names of the graphs and the names of the embedding strategies. Use meaningful names.

In [2]:
# Keys identifying the graphs under evaluation.
graph_keys = [
    "facebook",
    "citation",
    "biological",
    "CL",
    "COX2",
]
# Keys identifying the embedding techniques under evaluation.
embedding_keys = [
    "LINE",
]

### Temporary container for helper functions

In [3]:
def plot_PCA(embeddings, graph_name = "G", figsize = (5, 5)):
    """
    Projects the embeddings onto their first two principal components
    and shows them as a scatter plot.

    Parameters:
        embeddings (NumPy array): numpy array containing the embeddings, each row is a node embedding
        graph_name (str): Name of the graph, used only in the plot title.
        figsize (tuple): (width, height) of the figure in inches.
    """
    # Reduce dimensions to 2D using PCA
    pca = PCA(n_components=2)
    pca_embeddings = pca.fit_transform(embeddings)

    # The size has to be given when the figure is created:
    # pyplot has no `figsize` function, so `plt.figsize(...)` raises AttributeError.
    fig, ax = plt.subplots(figsize=figsize)
    ax.scatter(pca_embeddings[:, 0], pca_embeddings[:, 1], s=10)
    ax.set_title(f"Visualization in 2D of the embeddings of {graph_name} graph.")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    plt.show()

# Example usage (run after the embeddings have been loaded):
#plot_PCA(embeddings["facebook"]["LINE"], "facebook")
#plot_PCA(embeddings["CL"]["LINE"], "CL")
#plot_PCA(embeddings["biological"]["LINE"], "biological")

# Loading the embeddings
Now we load the embeddings. Each embedding must be stored as a .npy file inside the `embeddings/` folder (relative to the working directory).<br>
*NOTE*: the file names must respect the format: "embeddings_{graph_key}_{embedding_key}.npy".<br>
Embeddings are stored in a dictionary of dictionaries.<br>
The first index refers to the graph (e.g. embeddings["facebook"] contains the embeddings of the facebook graph for every embedding technique).<br>
The second index refers to the embedding technique (e.g. embeddings["facebook"]["LINE"] contains the embedding of the facebook graph computed using LINE).

In [4]:
def load(name):
    """
    Reads the array saved as 'embeddings/<name>.npy'.

    Parameters:
        name (str): File name without the '.npy' extension, looked up in the 'embeddings' directory.

    Returns:
        np.ndarray or None: The array stored in the file, or None (after printing
        a warning) when the file does not exist.
    """
    file_name = f"embeddings/{name}.npy"
    if os.path.exists(file_name):
        return np.load(file_name)

    # Missing file: warn instead of raising, the caller receives None.
    print(f"Warning: File '{file_name}' not found.")
    return None

# Nested dictionary: embeddings[graph_key][emb_key] holds the array returned by load(),
# which is None when the corresponding file is missing.
embeddings = {}
for graph_key in graph_keys:
    embeddings[graph_key] = {}
    for emb_key in embedding_keys:
        # The keys are used verbatim (no case normalization) to build the file name.
        # The previous bare `emb_key.lower()` / `graph_key.lower()` calls discarded
        # their result (strings are immutable), so they never changed the name.
        embeddings[graph_key][emb_key] = load(f"embeddings_{graph_key}_{emb_key}")

# Loading the graphs
Selected graphs:
- Facebook_combined    https://snap.stanford.edu/data/ego-Facebook.html          
- cit-HepTh            https://networkrepository.com/cit-HepTh.php             
- bio-CE-CX            https://networkrepository.com/bio-CE-CX.php             
- CL-100K-1d8-L9       https://networkrepository.com/CL-100K-1d8-L9.php ---- the graph has node labels
- COX2-MD              https://networkrepository.com/COX2-MD.php  ---- the graph has node labels

To run this notebook, adjust the paths below to match where the files are saved on your PC.<br>
To keep paths as they are, create a "data" folder inside the directory of this notebook, and store the files there.<br><br>

Graphs are stored as a dictionary: the key is the graph name, the value is the corresponding NetworkX graph.<br>

In [5]:
# Edge-list files of the selected graphs (relative paths).
facebook_path = "data/facebook_combined.txt.gz"  # gzip-compressed
citation_path = "data/cit-HepTh.edges"
biological_path = "data/bio-CE-CX.edges"
CL_path = "data/CL-100K-1d8-L9/CL-100K-1d8-L9.edges"
COX2_path = "data/COX2-MD/COX2-MD.edges"

In [12]:
def load_graph(path):
    """
    Builds an undirected graph from a plain-text edge list (files with extension .edges).

    Lines starting with '%' are treated as comments. Every other line is split on
    commas and/or whitespace; its first two fields are parsed as integer node ids
    and added as an edge. Lines with fewer than two fields are ignored.

    Parameters:
        path (str): Path of the edge-list file.

    Returns:
        networkx.Graph: The graph, with nodes relabeled by
        nx.convert_node_labels_to_integers.
    """
    separator = re.compile(r'[,\s]+')
    graph = nx.Graph()
    with open(path, 'rt') as handle:
        for raw_line in handle:
            if raw_line.startswith('%'):
                continue  # comment line
            fields = separator.split(raw_line.strip())
            if len(fields) >= 2:
                # Only the first two columns (the endpoints) are used
                graph.add_edge(int(fields[0]), int(fields[1]))
    return nx.convert_node_labels_to_integers(graph)

def load_graph_with_gz(path):
    """
    For files with extension .txt.gz

    Builds an undirected graph from a gzip-compressed, whitespace-separated edge list.
    Blank lines, lines starting with '#' or '%' and lines with fewer than two
    fields are skipped; only the first two fields of a line are used, so extra
    columns do not abort the load.

    Parameters:
        path (str): Path of the gzip-compressed edge-list file.

    Returns:
        networkx.Graph: The graph, with nodes relabeled by
        nx.convert_node_labels_to_integers.
    """
    G = nx.Graph()
    with gzip.open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            # A blank or comment line used to raise ValueError when unpacking
            if not line or line.startswith(('#', '%')):
                continue
            data = line.split()
            if len(data) < 2:  # Skip lines that don't have at least two columns
                continue
            G.add_edge(int(data[0]), int(data[1]))
    G = nx.convert_node_labels_to_integers(G)  # Relabel nodes to integers
    return G

def print_graphs_info(graphs):
    """
    Prints the number of nodes and edges of every graph in the dictionary.

    Parameters:
        graphs (dict): Maps a graph name to the corresponding graph object.
    """
    # Iterate over the argument itself instead of the global `graph_keys`,
    # so the function works for any dictionary of graphs it is given.
    for name, G in graphs.items():
        print(f"{name}: |V|={len(G.nodes)}, |E|={len(G.edges)}")

In [13]:
# The facebook graph is the only one stored as a gzip-compressed .txt.gz file;
# all the others are plain .edges files.
graphs = {
    graph_keys[0]: load_graph_with_gz(facebook_path),  # relabeling nodes to integer
    graph_keys[1]: load_graph(citation_path),
    graph_keys[2]: load_graph(biological_path),
    graph_keys[3]: load_graph(CL_path),  # node labeled
    graph_keys[4]: load_graph(COX2_path),  # node labeled
}

print_graphs_info(graphs)

facebook: |V|=4039, |E|=88234
citation: |V|=22908, |E|=2444798
biological: |V|=15229, |E|=245952
CL: |V|=92482, |E|=436611
COX2: |V|=7962, |E|=101542


# Reconstruction error

In [14]:
# Reconstruction errors, organised like the embeddings structure:
# one (initially empty) inner dictionary per graph key.
RE = {graph_key: {} for graph_key in graph_keys}

def reconstruction_error(G, embeddings, batch_size = 100_000):
    """
    Computes the reconstruction error of the graph by comparing cosine similarity
    only for existing edges in the graph, avoiding dense adjacency matrix computations.
    The reason is that for large graph an exact computation causes memory issues, due to very large matrices.

    For every edge (u, v) the term (1 - cos(emb[u], emb[v]))^2 is accumulated and the
    sum is divided by the number of edges. The edges are processed in vectorized
    batches instead of one library call per edge, which is what makes graphs with
    millions of edges tractable.

    Parameters:
        G (networkx.Graph): The input graph. Node labels are used directly as row
            indices into `embeddings`.
        embeddings (NumPy array): numpy array containing the embeddings, each row is a node embedding
        batch_size (int): Number of edges processed per vectorized step; bounds the
            size of the temporary arrays.

    Returns:
        float: The reconstruction error as the average squared difference for existing edges,
        or NaN when the graph has no edges (the average is undefined).
    """
    n_edges = G.number_of_edges()
    if n_edges == 0:
        # Avoid a ZeroDivisionError: there is nothing to average over.
        return float("nan")

    vectors = np.asarray(embeddings, dtype=np.float64)

    # Normalize every row once, so the cosine similarity of an edge is a plain dot product.
    # Zero vectors keep a norm of 1 and therefore yield similarity 0.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit_vectors = vectors / norms

    # Flatten the edge list into an (n_edges, 2) array of endpoint indices.
    endpoints = np.fromiter(
        (node for edge in G.edges() for node in edge), dtype=np.intp
    ).reshape(-1, 2)

    # Compute similarities only for existing edges, one batch at a time
    error = 0.0
    for start in range(0, len(endpoints), batch_size):
        batch = endpoints[start:start + batch_size]
        sims = np.sum(unit_vectors[batch[:, 0]] * unit_vectors[batch[:, 1]], axis=1)
        error += float(np.sum((1.0 - sims) ** 2))

    return error / n_edges

def print_reconstruction_error(err, graph_name , embedding_technique):
    """
    Prints one reconstruction error in a readable, single-line format.

    Parameters:
        err: The reconstruction error value to report.
        graph_name (str): Name of the graph the error refers to.
        embedding_technique (str): Name of the embedding technique that was evaluated.
    """
    message = f"RE of {graph_name} graph using {embedding_technique}: {err}"
    print(message)

def compute_all_reconstruction_errors(graph_keys, embedding_keys, show_results = True):
    """
    Computes the reconstruction error for every (graph, embedding technique) pair
    and stores it in the global RE dictionary as RE[graph_key][emb_key].

    Pairs whose embeddings are not available (the entry is missing or None, which is
    what load() returns for a file that was not found) are skipped with a warning
    and recorded as None instead of crashing the whole run.

    Parameters:
        graph_keys (list): Keys of the graphs to evaluate.
        embedding_keys (list): Keys of the embedding techniques to evaluate.
        show_results (bool): If True, prints each reconstruction error as it is computed.
    """
    for graph_key in graph_keys:
        if show_results:
            print(f"\nReconstruction errors for {graph_key} graph:\n")
        # Make sure the per-graph result dictionary exists for the keys passed in.
        results = RE.setdefault(graph_key, {})
        for emb_key in embedding_keys:
            emb = embeddings.get(graph_key, {}).get(emb_key)
            if emb is None:
                print(f"Warning: no {emb_key} embeddings available for {graph_key} graph, skipping.")
                results[emb_key] = None
                continue
            results[emb_key] = reconstruction_error(graphs[graph_key], emb)
            if show_results:
                print_reconstruction_error(results[emb_key], graph_key, emb_key)

In [None]:
# Fill RE for every configured graph / embedding technique, printing each value.
compute_all_reconstruction_errors(
    graph_keys,
    embedding_keys,
    show_results=True,
)

Reconstruction errors for facebook graph
RE of facebook graph using LINE: 0.11006272702076819
Reconstruction errors for citation graph
