# Learning from networks project: Evaluation of different Node Embedding algorithms
Members:<br>
- D'Emilio Filippo, id : 2120931
- Volpato Pietro, id : 2120825

## Embedding evaluation notebook

In [1]:
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import gzip
import re
import os

# configuration
Here you can properly configure the names of the graphs and the names of the embedding strategies. Use meaningful names.

In [2]:
# Graph identifiers: used as dictionary keys and inside the embedding file names.
# NOTE: the order matters, because the graph-loading cell indexes this list by position.
graph_keys = [
    "facebook",
    "citation",
    "biological",
    "CL",
    "COX2",
]

# Embedding techniques to evaluate (also part of the embedding file names).
embedding_keys = [
    "LINE",
    "node2vec",
]

### Functions temporary container

In [3]:
def plot_PCA(embeddings, graph_name = "G"):
    """
    Project the embeddings onto their first two principal components and show a scatter plot.

    Parameters:
        embeddings (array-like): 2D array where each row is a node embedding.
        graph_name (str): Name of the graph, used only in the plot title.

    Raises:
        ValueError: if embeddings is None (e.g. the embedding file was not found when loading).
    """
    if embeddings is None:
        raise ValueError(f"No embeddings available for {graph_name} graph: nothing to plot.")

    # Reduce dimensions to 2D using PCA
    pca = PCA(n_components=2)
    pca_embeddings = pca.fit_transform(embeddings)

    # pyplot has no `figsize` function: the figure size must be given when the figure is created.
    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(pca_embeddings[:, 0], pca_embeddings[:, 1], s=10)
    ax.set_title(f"Visualization in 2D of the embeddings of {graph_name} graph.")
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")
    plt.show()

#plot_PCA(embeddings_facebook["LINE"], "facebook")
#plot_PCA(embeddings_CL["LINE"], "CL")
#plot_PCA(embeddings_biological["LINE"], "biological")

# Loading the embeddings
Now we load the embeddings, which should be stored as .npy files in the ../result folder.<br>
*NOTE*: the file names must respect the format: "embeddings_{graph_key}_{embedding_key}.npy".<br>
Embeddings are stored in a dictionary of dictionaries.<br>
The first index refers to the graph (e.g. embeddings["facebook"] contains the embeddings of the facebook graph for every embedding technique).<br>
The second index refers to the embedding technique (e.g. embeddings["facebook"]["LINE"] contains the embedding of the facebook graph computed using LINE).

In [4]:
def load(name, directory="../result"):
    """
    Loads a NumPy array from a .npy file. If the file is not found,
    displays a warning and returns None.

    Parameters:
        name (str): The name of the file (without the .npy extension).
        directory (str): Folder that contains the file. Defaults to "../result".

    Returns:
        np.ndarray or None: The loaded NumPy array, or None if the file is not found.
    """
    file_name = f"{directory}/{name}.npy"
    if not os.path.exists(file_name):
        # Missing files are reported but not fatal: callers receive None and must handle it.
        print(f"Warning: File '{file_name}' not found.")
        return None

    return np.load(file_name)

# embeddings[graph_key][emb_key] -> np.ndarray, or None when the file was not found.
embeddings = {}
for graph_key in graph_keys:
    embeddings[graph_key] = {}
    for emb_key in embedding_keys:
        # The keys are used verbatim in the file name. The previous bare `.lower()` calls
        # discarded their result (strings are immutable), so they never changed the name.
        file_stem = f"embeddings_{graph_key}_{emb_key}"
        embeddings[graph_key][emb_key] = load(file_stem)



# Loading the graphs
Selected graphs:
- Facebook_combined    https://snap.stanford.edu/data/ego-Facebook.html          
- cit-HepTh            https://networkrepository.com/cit-HepTh.php             
- bio-CE-CX            https://networkrepository.com/bio-CE-CX.php             
- CL-100K-1d8-L9       https://networkrepository.com/CL-100K-1d8-L9.php ---- the graph has node labels
- COX2-MD              https://networkrepository.com/COX2-MD.php  ---- the graph has node labels

To run this notebook, adjust the paths to match where the files are saved on your PC.<br>
To keep the paths as they are, create a "data" folder next to the directory that contains this notebook (the paths point to "../data"), and store the files there.<br><br>

Graphs are stored as a dictionary: the key is the graph name, the value is the corresponding networkx graph.<br>

In [5]:
# Edge-list files, one per graph (relative paths).
facebook_path = "../data/facebook_combined.txt.gz"  # gzip-compressed edge list
citation_path = "../data/cit-HepTh.edges"
biological_path = "../data/bio-CE-CX.edges"
CL_path = "../data/CL-100K-1d8-L9.edges"
COX2_path = "../data/COX2-MD.edges"

In [6]:
def load_graph(path):
    """
    Build an undirected graph from a plain-text .edges file.

    Lines starting with '%' are treated as comments, and lines with fewer than two
    comma/whitespace-separated fields are ignored. Only the first two fields of each
    remaining line are used, as integer node ids; any further column is discarded.

    Returns the result of relabel_get_mapping applied to the graph that was built.
    """
    graph = nx.Graph()
    with open(path, 'rt') as handle:
        for raw_line in handle:
            # Comment lines are recognised on the raw line, before stripping
            if raw_line.startswith('%'):
                continue
            fields = re.split(r'[,\s]+', raw_line.strip())
            if len(fields) >= 2:
                source, target = (int(field) for field in fields[:2])
                graph.add_edge(source, target)

    return relabel_get_mapping(graph)

def load_graph_with_gz(path):
    """
    Build an undirected graph from a gzip-compressed edge list (.txt.gz).

    Each data line must contain exactly two whitespace-separated integer node ids.
    Blank lines and lines starting with '#' are skipped instead of raising an error.

    Returns the result of relabel_get_mapping applied to the graph that was built.
    """
    G = nx.Graph()
    with gzip.open(path, 'rt') as f:
        for line in f:
            line = line.strip()
            # A trailing empty line or a '#' header line must not abort the whole load
            if not line or line.startswith('#'):
                continue
            node1, node2 = map(int, line.split())
            G.add_edge(node1, node2)

    return relabel_get_mapping(G)

def print_graphs_info(graphs):
    """
    Print the number of nodes and edges of every graph in the dictionary.

    Parameters:
        graphs (dict): Maps a graph name to a graph object exposing `nodes` and `edges`.

    The keys of `graphs` itself are iterated (in insertion order), so the function works
    for any dictionary of graphs and does not depend on the global `graph_keys` list.
    """
    for name, G in graphs.items():
        print(f"{name}: |V|={len(G.nodes)}, |E|={len(G.edges)}")

def relabel_get_mapping(G):
    """
    Relabel the nodes of G with consecutive integers from 0 to |V|-1, following the
    iteration order of G.nodes.

    Returns a pair (relabeled_graph, mapping), where mapping is a dictionary
    {original node name : new integer label}.
    """
    # original name -> new integer label
    mapping = dict(zip(G.nodes, range(len(G.nodes))))
    relabeled = nx.relabel_nodes(G, mapping)
    return relabeled, mapping

In [8]:
# (loader, path) pairs, listed in the same order as graph_keys.
graph_sources = [
    (load_graph_with_gz, facebook_path),  # the only gzip-compressed (.txt.gz) file
    (load_graph, citation_path),
    (load_graph, biological_path),
    (load_graph, CL_path),                # node labeled
    (load_graph, COX2_path),              # node labeled
]

graphs = {}  # graph key -> networkx graph with nodes relabeled to integers
mappings = {key: {} for key in graph_keys}  # graph key -> {original name : relabeled name}

for position, (loader, path) in enumerate(graph_sources):
    graph, mapping = loader(path)
    key = graph_keys[position]
    graphs[key] = graph
    mappings[key] = mapping

print_graphs_info(graphs)

facebook: |V|=4039, |E|=88234
citation: |V|=22908, |E|=2444798
biological: |V|=15229, |E|=245952
CL: |V|=92482, |E|=436611
COX2: |V|=7962, |E|=101542


# Reconstruction error

In [None]:
# RE mirrors the layout of the embeddings dictionary:
# RE[graph_key][embedding_key] -> reconstruction error.
RE = {k: {} for k in graph_keys}

def reconstruction_error(G, embeddings, chunk_size=100000):
    """
    Computes the reconstruction error of the graph by comparing cosine similarity
    only for existing edges in the graph, avoiding dense adjacency matrix computations.
    The reason is that for large graphs an exact computation causes memory issues,
    due to very large adjacency matrices.

    For every edge (u, v) the term (1 - cos_sim(emb[u], emb[v]))**2 is accumulated, and
    the mean over all edges is returned. A zero vector has cosine similarity 0 with any
    other vector.

    Parameters:
        G (networkx.Graph): The input graph. Node ids must be valid row indices of `embeddings`.
        embeddings (NumPy array): 2D array containing the embeddings; row i is the embedding of node i.
        chunk_size (int): Number of edges processed per vectorized step (bounds memory use).

    Returns:
        float: The average squared difference over existing edges; 0.0 if the graph has no edges.

    Raises:
        ValueError: if embeddings is None or not 2D, or if chunk_size < 1.
    """
    if embeddings is None:
        raise ValueError("embeddings is None: was the embedding file found when loading?")
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    vectors = np.asarray(embeddings, dtype=np.float64)
    if vectors.ndim != 2:
        raise ValueError("embeddings must be a 2D array with one row per node")

    # Normalise every row once, so that a cosine similarity becomes a plain dot product.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # keep zero vectors at zero instead of dividing by 0
    unit_vectors = vectors / norms

    # Rows are indexed directly by node id. Re-ordering them by G.nodes first and then
    # indexing by node id again would be wrong whenever the node order is not 0..n-1.
    edge_array = np.asarray(list(G.edges()), dtype=np.intp).reshape(-1, 2)
    num_edges = edge_array.shape[0]
    if num_edges == 0:
        return 0.0

    # Process the edges in chunks: vectorized, but without materialising |E| x D matrices.
    total = 0.0
    for start in range(0, num_edges, chunk_size):
        chunk = edge_array[start:start + chunk_size]
        sims = np.einsum("ij,ij->i", unit_vectors[chunk[:, 0]], unit_vectors[chunk[:, 1]])
        total += float(np.sum((1.0 - sims) ** 2))

    return total / num_edges

def print_reconstruction_error(err, graph_name , embedding_technique):
    """Print one line reporting the reconstruction error of a graph / embedding technique pair."""
    message = f"RE of {graph_name} graph using {embedding_technique}: {err}"
    print(message)

def compute_all_reconstruction_errors(graph_keys, embedding_keys, show_results = True):
    """
    Compute the reconstruction error for every requested (graph, embedding technique) pair
    and store it in the global RE dictionary as RE[graph_key][emb_key].

    Parameters:
        graph_keys (list): Keys of the graphs to evaluate (must be keys of `graphs`).
        embedding_keys (list): Keys of the embedding techniques to evaluate.
        show_results (bool): If True, print each reconstruction error as it is computed.

    Pairs whose embeddings are missing (not loaded, or None because the file was not
    found) are skipped with a warning instead of aborting the whole run.
    """
    for graph_key in graph_keys:
        if show_results:
            print(f"\nReconstruction errors for {graph_key} graph:\n")
        # setdefault: tolerate graph keys that were not pre-registered in RE
        graph_results = RE.setdefault(graph_key, {})
        for emb_key in embedding_keys:
            emb = embeddings.get(graph_key, {}).get(emb_key)
            if emb is None:
                print(f"Warning: no {emb_key} embeddings available for {graph_key} graph, skipping.")
                continue
            graph_results[emb_key] = reconstruction_error(graphs[graph_key], emb)
            if show_results:
                print_reconstruction_error(graph_results[emb_key], graph_key, emb_key)

## Compute the RE
Here you can compute the reconstruction error.<br>
- Set graph_keys_RE with the keys of the graphs you are interested in. Use graph_keys_RE = graph_keys for all graphs.<br>
- Set embedding_keys_RE with the keys of the embedding strategies you are interested in. Use embedding_keys_RE = embedding_keys for all embedding strategies.<br>

In [None]:
# Subset of graphs / embedding techniques for which the reconstruction error is computed.
graph_keys_RE = ["biological"]
embedding_keys_RE = ["LINE"]
compute_all_reconstruction_errors(
    graph_keys=graph_keys_RE,
    embedding_keys=embedding_keys_RE,
    show_results=True,
)

# NODE CLASSIFICATION
There are two graphs with node labels: CL graph and COX2 graph.<br>

To do node classification we train an SVM. The dataset is composed of the embeddings of one of those two graphs, obtained with one of the embedding algorithms considered, and of the node labels (multi-class classification task).

The node classification considers the TRANSDUCTIVE CASE: some embedding-label pairs are held out for the test set, but the embeddings were computed
on the whole graph.

IMPORTANT: the labels of the "CL graph" are organized in a very unusual way: the nodes of the graph are not sequential numbers (e.g. some numbers are missing for some reason), but the labels are, and they do not respect the names of the original nodes. <br>
For example, in the label file there is the entry 16,2 (node 16 has label 2), but in the original .edges file node 16 does not even exist. We assume that it means "the 16th node has label 2". <br>
The function that extracts the node labels therefore assumes that the labels are listed in sequential order (e.g. entry 1 is the label of the first node, entry 2 of the second node, etc.). This requirement is satisfied for both the labeled graphs we are considering. <br>
We originally wanted to apply the mapping to the node-label pairs, to correctly match the labels with the renamed nodes, but unfortunately, because of this unusual representation of the data for the CL graph, this is not possible, and we have to make this precise assumption.

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [10]:
# Node-label files of the two labeled graphs (relative paths).
COX2_labels_path = "../data/COX2-MD.node_labels"
CL_labels_path = "../data/CL-100K-1d8-L9.node_labels"

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

def load_node_labels(file_path):
    """
    Read a node-label file and return a dictionary {zero-based node index : label}.

    Each non-empty line holds one entry, in one of two formats:
        1. Comma-separated: node,label
        2. Space-separated: node label

    IMPORTANT: node ids in the file are expected to start from 1 and to list the nodes
    sequentially (line 1 is the label of the first node, line 2 of the second node, etc.).
    Every id is shifted by -1 so that the keys start from 0, like the embedding rows.

    Parameters: file_path : Path to the node label file.

    Return: A dictionary where keys are (shifted) node ids and values are labels, both as int.
    """
    shifted_labels = {}
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:  # empty lines carry no label
                # A comma anywhere in the line selects the comma-separated format
                fields = entry.split(',') if ',' in entry else entry.split()
                node, label = fields
                # File ids start from 1; embeddings are indexed from 0
                shifted_labels[int(node) - 1] = int(label)

    return shifted_labels

def train_SVM(embeddings, labels, test_size=0.15, random_state=69):
    """
    Train and evaluate an SVM classifier for multi-class node classification.

    Parameters:
        embeddings (np.ndarray): Numpy array where each row is a node's embedding.
        labels (dict): Dictionary mapping node indices 0..N-1 to their labels.
        test_size (float): Fraction of the samples held out for the test set.
        random_state (int): Seed of the train/test split, for reproducibility.

    Returns:
        dict: A dictionary with accuracy, F1 score, and a detailed classification report.

    Raises:
        ValueError: if embeddings is None, or if the number of embeddings differs from
            the number of labels.
        KeyError: if `labels` does not contain every index from 0 to len(labels)-1.
    """
    if embeddings is None:
        raise ValueError("embeddings is None: was the embedding file found when loading?")

    # Ensure X (features) and y (labels) are aligned
    X = np.array(embeddings)  # Node embeddings
    y = np.array([labels[i] for i in range(len(labels))])  # Ensure correct ordering of labels
    if len(X) != len(y):
        raise ValueError(
            f"Number of embeddings ({len(X)}) does not match number of labels ({len(y)})."
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    clf = SVC(kernel='rbf', decision_function_shape='ovo')  # 'ovo' = one-vs-one for multi-class
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    report = classification_report(y_test, y_pred)

    # Print results
    print("SVM Classifier Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Macro F1-Score: {f1:.4f}")
    print("Classification Report:\n", report)

    # Return results as a dictionary
    return {"accuracy": accuracy, "macro_f1": f1, "report": report}

# labels[graph_key] -> {zero-based node index : class label}, for the two labeled graphs.
labels = {
    "COX2": load_node_labels(COX2_labels_path),
    "CL": load_node_labels(CL_labels_path),
}

# Analysis of the datasets
It is always a good idea to have a look at the datasets we are dealing with.<br>
- The features are the embeddings of the nodes: each one is a D-dimensional vector, where D is the dimension of the specific embeddings
we are using for training.
- The labels represent the classes of the nodes. We can analyze the label set to see how many different classes there are and to understand how balanced the dataset is.

In [17]:
def analyze_labels(labels):
    """
    Print the number of samples, the number of distinct classes and the number of
    samples per class for a {node : label} dictionary.

    Classes are listed in order of first appearance among the dictionary values.
    """
    samples_per_class = {}
    for class_id in labels.values():
        samples_per_class[class_id] = samples_per_class.get(class_id, 0) + 1

    print(f"Number of samples: {len(labels)}")
    print(f"Number of classes: {len(samples_per_class)}")
    for class_id, count in samples_per_class.items():
        print(f"Samples of class {class_id}: {count}")

# Print the class statistics of both labeled datasets, one after the other.
for dataset_heading, dataset_key in (
    ("INFO DATASET COX2 GRAPH", "COX2"),
    ("\nINFO DATASET CL GRAPH", "CL"),
):
    print(dataset_heading)
    analyze_labels(labels[dataset_key])

INFO DATASET COX2 GRAPH
Number of samples: 7962
Number of classes: 7
Samples of class 1: 5575
Samples of class 2: 599
Samples of class 3: 571
Samples of class 4: 354
Samples of class 5: 767
Samples of class 6: 89
Samples of class 7: 7

INFO DATASET CL GRAPH
Number of samples: 92482
Number of classes: 9
Samples of class 8: 10276
Samples of class 9: 10274
Samples of class 2: 10276
Samples of class 6: 10276
Samples of class 1: 10276
Samples of class 3: 10276
Samples of class 5: 10276
Samples of class 4: 10276
Samples of class 7: 10276


In [None]:
# Graph / embedding technique pair used for the node classification experiment.
graph_key = "CL"
embedding_key = "LINE"
train_SVM(
    embeddings=embeddings[graph_key][embedding_key],
    labels=labels[graph_key],
)