Frequently Bought Together Items

Inizializzazione, parametri e altre cose che tutti devono sapere

> Indented block



In [None]:
# Mount Google Drive inside the Colab runtime and move into the project folder.
from google.colab import drive
import os

drive.mount('/content/drive/')
# Absolute path (under the mount point above) so that re-running this cell
# still works after the working directory has already been changed.
os.chdir("/content/drive/MyDrive/_COLAB")

In [None]:
#Author: Everyone
import random
import networkx as nx
import node2vec
from gensim.models import Word2Vec
import os.path
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import time
import math

# Target node; placeholder value, replaced later by a randomly chosen node
target = -1

# All graphs to be tested (filled by the loading loop)
graphs = []

# Working directories
data_dir = "data/"      # graph data files
models_dir = "models/"  # saved embeddings
plots_dir = "plots/"    # saved plots

# Create every working directory that does not exist yet
for _required_dir in (data_dir, models_dir, plots_dir):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir)

# Base names of the graph data files, expected inside `data_dir`
file_ext = ".txt"
files = ["Amazon0302", "Amazon0312", "Amazon0505", "Amazon0601", "AmazonU"]

# Whether the graph at the same position in `files` is directed
directed = [True, True, True, True, False]

# Embedding hyperparameters
# defaults: p=1, q=1, num_walks=10, walk_length=80
p, q = 1, 1
num_walks, walk_length = 10, 80

### LOAD GRAPHS ###
# Every graph is read with integer node labels: the rest of the notebook picks
# targets with random.randrange and indexes cluster assignments by node id, so
# string labels (the read_edgelist default) would not match.
# NOTE(review): assumes every edge-list file uses integer node ids - confirm for "AmazonU".
for i in range(len(files)):
    edge_list_path = data_dir + files[i] + file_ext
    print("loading graph " + edge_list_path)
    # Directed graphs use nx.DiGraph, undirected ones nx.Graph
    graph_class = nx.DiGraph if directed[i] else nx.Graph
    graphs.append(nx.read_edgelist(edge_list_path, nodetype=int, create_using=graph_class))

    # Give every edge a unit weight
    nx.set_edge_attributes(graphs[i], 1, name='weight')

print("finished")

Embedding using Stanford's node2vec

In [None]:
#Author:Savoia Emanuele
# One Word2Vec model per graph, in the same order as `graphs`
models = []

for i in range(len(graphs)):
    filename = models_dir + files[i] + ".model"

    if os.path.isfile(filename):
        # A model was saved by a previous run: load it instead of recomputing
        models.append(Word2Vec.load(filename))
        print("Saved model found for file " + filename)
        continue

    print("No saved model found for file " + filename + ", generating embedding...")
    # Generate the random walks with node2vec
    G = node2vec.Graph(graphs[i], directed[i], p, q)
    G.preprocess_transition_probs()
    # Walk entries are converted to strings before being passed to Word2Vec
    walks = [[str(node) for node in walk] for walk in G.simulate_walks(num_walks, walk_length)]
    models.append(Word2Vec(walks, window=10, min_count=0, sg=1, workers=100))
    # Persist the model so the next run can skip the training
    models[i].save(filename)

Clustering with kmeans

In [None]:
#Author: Savoia Emanuele

# Minimum tolerated silhouette score to keep searching is: best_score - max_error
max_error = 0.025

def sklearn_kmeans_find_k(points, sampled=True):
    """Search for a good number of clusters k using k-means++ and the silhouette score.

    k starts at int(4.5 * sqrt(len(points))) and grows in steps of 15 while
    k < len(points). The search stops early as soon as a score drops below
    ``best_score - max_error`` (``max_error`` is the module-level constant).

    Parameters:
        points: array-like of vectors to cluster.
        sampled: if True use MiniBatchKMeans, otherwise KMeans.

    Returns:
        (best_clustering, scores, k_tried): the labels of the best-scoring
        clustering, the accepted silhouette scores and the k values they belong
        to. When no k could be tried (too few points) all three are empty.
    """
    start_time = time.time()
    # best clustering found so far
    best_clustering = []
    # accepted scores and their k values, used for plotting
    scores = []
    k_tried = []
    # best silhouette found so far
    best_score = -1.0
    # minimum k value
    min_k = int(math.sqrt(len(points))*4.5)
    for k in range(min_k, len(points), 15):
        print("trying kmeans with k = " + str(k))
        if sampled:
            # choose the mini-batch size based on k and on the number of points
            sample_size = int(len(points)/100)
            if sample_size < int(k*7.75):
                sample_size = int(k*7.75)
                if(sample_size < 1024 and len(points) >= 1024):
                    sample_size = 1024
                elif(sample_size > len(points)):
                    sample_size = len(points)
            # k-means++ initialisation with mini batches
            kmeans = MiniBatchKMeans(n_clusters=k, init='k-means++', batch_size=sample_size, n_init='auto')
        else:
            # plain k-means with k-means++ initialisation
            kmeans = KMeans(n_clusters=k, init='k-means++', n_init='auto')
        clustering = kmeans.fit_predict(points)
        # silhouette is estimated on a sample of the points
        score = silhouette_score(points, clustering, sample_size=min_k*10)
        print("> Current silhouette score:   " + str(score))
        print("> Last best silhouette score: " + str(best_score))
        # update results and check whether the score is still within tolerance
        if score >= best_score:
            best_clustering = clustering
            best_score = score
            scores.append(score)
            k_tried.append(k)
        elif score >= best_score - max_error:
            scores.append(score)
            k_tried.append(k)
        else:
            # score dropped too far below the best one: stop searching
            break
    # Single exit point: previously the function fell off the end and returned
    # None when the loop finished without hitting the early-stop branch.
    print("Time since start: " + str(time.time() - start_time))
    return best_clustering, scores, k_tried

def plot_save_scores(scores, k_tried, title_):
    """Plot the silhouette scores against k, save the figure and show it.

    The figure is written to ``plots_dir + title_ + ".png"``; ``title_`` is
    also used as the plot title.
    """
    figure, axes = plt.subplots()
    axes.plot(k_tried, scores)

    # Axis labels, title and grid
    axes.set_xlabel('k')
    axes.set_ylabel('silhouette score')
    axes.set_title(title_)
    axes.grid()

    output_path = plots_dir + title_ + ".png"
    figure.savefig(output_path)
    plt.show()

def reduce_graph_to_cluster_subgraph(graph, directed, c, target):
    """Return the subgraph of the target's cluster that is connected to the target.

    Parameters:
        graph: networkx graph; nodes are looked up as 0 .. len(graph.nodes) - 1.
        directed: True if ``graph`` is directed (connectivity is then evaluated
            on its undirected version, i.e. weak connectivity).
        c: cluster label of each node, indexed by node id.
        target: id of the target node.

    Returns:
        A subgraph of ``graph`` (for both directed and undirected input)
        containing the nodes that share the target's cluster and belong to the
        same connected component as the target.
    """
    target_cluster = c[target]
    # Nodes assigned to the same cluster as the target
    clustered = [p for p in range(len(graph.nodes)) if c[p] == target_cluster]
    clustered_graph = graph.subgraph(clustered)
    print("Clustered graph: " + str(clustered_graph))
    # Remove nodes that are not connected to the target
    connectivity_graph = clustered_graph.to_undirected() if directed else clustered_graph
    nodes = nx.node_connected_component(connectivity_graph, target)
    # node_connected_component returns a set of nodes: always turn it back into
    # a subgraph (the undirected case used to return the bare node set).
    clustered_graph = clustered_graph.subgraph(nodes)
    print("Clustered graph's (weakly) connected component with target node: " + str(clustered_graph))
    return clustered_graph

**Execution**

In [None]:
#Author: Savoia Emanuele

# Convert graph embeddings into numpy arrays to be clustered.
# Row p of each array must be the embedding of node p, because the clustering
# labels are later indexed by node id.
graph_arrays = []

for i in range(len(models)):
    # The walks were fed to Word2Vec as strings, so vectors are looked up by
    # the string key of the node. An integer passed to `wv[...]` is treated as
    # a position in the vocabulary, not as the node id.
    # The comprehension also keeps its loop variable local, so the global
    # hyperparameter `p` is no longer overwritten by this cell.
    vectors = [models[i].wv[str(node_id)] for node_id in range(graphs[i].number_of_nodes())]
    graph_arrays.append(np.array(vectors))

In [None]:
#Author: Savoia Emanuele

# Cluster every embedding with both k-means variants, keep the execution data
# and plot it. Each list has one entry per graph, in the order of `graph_arrays`.
best_c_sampled_arr, best_c_unsampled_arr = [], []
scores_sampled_arr, scores_unsampled_arr = [], []
k_tried_sampled_arr, k_tried_unsampled_arr = [], []

for points in graph_arrays:
    # MiniBatchKMeans first, then plain KMeans
    best_c_sampled, scores_sampled, k_tried_sampled = sklearn_kmeans_find_k(points)
    best_c_unsampled, scores_unsampled, k_tried_unsampled = sklearn_kmeans_find_k(points, False)

    # Store the results of this graph only after both runs have completed
    for store, value in (
        (best_c_sampled_arr, best_c_sampled),
        (best_c_unsampled_arr, best_c_unsampled),
        (scores_sampled_arr, scores_sampled),
        (scores_unsampled_arr, scores_unsampled),
        (k_tried_sampled_arr, k_tried_sampled),
        (k_tried_unsampled_arr, k_tried_unsampled),
    ):
        store.append(value)

# One plot per graph and per k-means variant
for i, _ in enumerate(graph_arrays):
    plot_save_scores(scores_sampled_arr[i], k_tried_sampled_arr[i], files[i] + "_sampled")
    plot_save_scores(scores_unsampled_arr[i], k_tried_unsampled_arr[i], files[i] + "_unsampled")

In [None]:
#Author: Savoia Emanuele

# targets[j] is the target node that clustered_graphs[j] was built around.
# Each graph contributes two consecutive entries: sampled clustering first,
# unsampled clustering second.
targets = []
clustered_graphs = []

for i in range(len(graphs)):
    # Target node chosen at random
    target = random.randrange(graphs[i].number_of_nodes())
    print("Target node is " + str(target) + " for graph " + str(i))
    for clustering in (best_c_sampled_arr[i], best_c_unsampled_arr[i]):
        targets.append(target)
        clustered_graphs.append(reduce_graph_to_cluster_subgraph(graphs[i], directed[i], clustering, target))



Valutazione nodi (usare clustered_graph)

In [None]:
#Author:Vendramin Riccardo
# FUNCTIONS DEFINITION

# Neighbors
def n(v, edges):
    """Return the list of nodes that share an edge with ``v`` (``v`` excluded)."""
    adjacent = set()
    # Collect the members of every edge that contains v
    adjacent.update(*(edge for edge in edges if v in edge))
    # v itself was added together with its edges: drop it
    adjacent.discard(v)
    return list(adjacent)

# Undirected graph where an edge exists only if both arcs exist
def create_undirected_graph_from_directed(directed_graph):
    """Build an nx.Graph holding an edge (u, v) for every edge of
    ``directed_graph`` that is present in both directions.

    Only nodes that appear in a kept edge are part of the result.
    """
    undirected_graph = nx.Graph()
    undirected_graph.add_edges_from(
        (source, destination)
        for source, destination in directed_graph.edges()
        if directed_graph.has_edge(source, destination)
        and directed_graph.has_edge(destination, source)
    )
    return undirected_graph

# Bron-Kerbosch algorithm with pivot
def BronKerbosch(R, P, X, edges, cliques):
    """Append to ``cliques`` every maximal clique that extends ``R`` using
    candidates from ``P`` and contains no node of ``X``.

    Parameters:
        R: list of nodes already in the clique under construction.
        P: iterable of candidate nodes.
        X: iterable of excluded nodes.
        edges: iterable of edges (tuples of nodes).
        cliques: output list; each clique found is appended as a list of nodes.

    ``P`` and ``X`` are copied, so the caller's containers are left untouched.
    The adjacency is built once from ``edges`` instead of rescanning the edge
    list for every candidate node.
    """
    adjacency = {}
    for edge in edges:
        members = set(edge)
        for node in members:
            # a node is never its own neighbor (self loops are ignored)
            adjacency.setdefault(node, set()).update(members - {node})

    _bron_kerbosch_pivot(list(R), set(P), set(X), adjacency, cliques)


def _bron_kerbosch_pivot(R, P, X, adjacency, cliques):
    """Recursive step of Bron-Kerbosch working on private sets P and X."""
    if not P and not X:
        # P and X are both empty, report R as a maximal clique
        cliques.append(R)
        return

    no_neighbors = frozenset()
    # Choose as pivot the vertex of P ⋃ X with the most neighbors in P:
    # only non-neighbors of the pivot need to be branched on.
    pivot = max(P | X, key=lambda u: len(P & adjacency.get(u, no_neighbors)))

    # Snapshot the candidates because P is modified inside the loop
    for v in list(P - adjacency.get(pivot, no_neighbors)):
        neighbors_of_v = adjacency.get(v, no_neighbors)
        # Recursively explore the neighborhood of v
        _bron_kerbosch_pivot(R + [v], P & neighbors_of_v, X & neighbors_of_v, adjacency, cliques)

        # Remove v from P and add it to X
        P.remove(v)
        X.add(v)

# Largest clique
def maxClique(cliques):
    """Return the first clique of maximum length, or [] if there are none."""
    largest = []
    found_any = False
    for clique in cliques:
        # strict comparison keeps the first clique among those of equal length
        if not found_any or len(clique) > len(largest):
            largest = clique
            found_any = True
    return largest

# Cliques that contain the target node
def nodeCliques(all_cliques, selected_node):
    """Return, in their original order, the cliques that contain ``selected_node``."""
    return [clique for clique in all_cliques if selected_node in clique]

# Clustering coefficient of every node
def runNodeCC(graph):
    """Return the result of ``nx.clustering`` for ``graph``."""
    return nx.clustering(graph)

# Find clique with best cc
def bestClique(node_cliques, node_cluster_coefs):
    """Return the clique with the highest mean clustering coefficient.

    Parameters:
        node_cliques: iterable of cliques (sequences of nodes).
        node_cluster_coefs: mapping node -> clustering coefficient.

    Returns:
        The first clique reaching the highest mean coefficient. A clique is
        selected only if its mean is strictly greater than 0; if none
        qualifies (or there are no cliques) [] is returned.
    """
    max_cc = 0
    best_clique = []
    for clique in node_cliques:
        if not clique:
            # an empty clique has no mean coefficient
            continue
        avg_cc = sum(node_cluster_coefs[node] for node in clique) / len(clique)

        print("clique:  " + str(clique))
        print("clique cc:  " + str(avg_cc))

        if avg_cc > max_cc:
            # remember the best mean too: without this update the last clique
            # with a positive mean was returned instead of the best one
            max_cc = avg_cc
            best_clique = clique
    return best_clique

In [None]:

def test_solution(clustered_graph, target_node=None, n_iterations=100):
    """Recompute the target's best clique after removing random nodes.

    At every iteration a fresh copy of ``clustered_graph`` loses a random set
    of nodes (never the target) and the best clique of the target is computed
    again on what is left.

    Parameters:
        clustered_graph: graph to perturb; it is not modified.
        target_node: node whose cliques are evaluated. When None, the
            notebook-level variable ``selected_node`` is used, so existing
            calls of the form ``test_solution(graph)`` keep working.
        n_iterations: number of perturbed graphs to evaluate.

    Returns:
        List with the best clique found at each iteration.
    """
    if target_node is None:
        # legacy behaviour: the target comes from the calling cell
        target_node = selected_node

    # Number of nodes to remove at each iteration: a random 1-10% of the graph
    n_nodes_to_remove = int(random.uniform(1, 10) / 100 * len(clustered_graph.nodes))

    # The target is never removed
    filtered_nodes = [element for element in clustered_graph.nodes() if element != target_node]
    n_nodes_to_remove = min(n_nodes_to_remove, len(filtered_nodes))

    results = []

    for iteration in range(n_iterations):
        # Remove random nodes from a copy of the input graph (previously the
        # global `graph` was copied and the parameter was ignored)
        nodes_to_remove = random.sample(filtered_nodes, n_nodes_to_remove)
        updated_graph = clustered_graph.copy()
        updated_graph.remove_nodes_from(nodes_to_remove)

        print(f"Iteration {iteration + 1}: Removed nodes {nodes_to_remove}")

        all_cliques = []
        updated_graph = create_undirected_graph_from_directed(updated_graph)
        BronKerbosch([], list(updated_graph.nodes), [], updated_graph.edges, all_cliques)
        node_cliques = nodeCliques(all_cliques, target_node)
        print("Cliques nodo:")
        print(node_cliques)
        best_clique = bestClique(node_cliques, runNodeCC(updated_graph))
        print("Best clique")
        print(best_clique)
        results.append(best_clique)

    return results

In [None]:
#plot results

def generate_histogram(array_of_arrays, title, found):
    """Plot how many times each clique appears in ``array_of_arrays``.

    Parameters:
        array_of_arrays: iterable of cliques (sequences of nodes).
        title: base name of the PNG written to ``plots_dir``.
        found: reference result, shown in the plot title.

    Cliques are compared regardless of the order of their nodes, so the same
    clique listed in two different orders is counted in a single bar.
    """
    def _canonical(clique):
        # Order-independent key for a clique
        try:
            return tuple(sorted(clique))
        except TypeError:
            # nodes that cannot be compared with each other
            return tuple(sorted(clique, key=repr))

    element_counts = Counter(_canonical(inner_array) for inner_array in array_of_arrays)

    # Extract elements and their counts
    elements = list(element_counts.keys())
    counts = list(element_counts.values())
    positions = list(range(len(elements)))

    # Plot the histogram
    fig, ax = plt.subplots()
    ax.bar(positions, counts, align='center')
    ax.set_xticks(positions)
    ax.set_xticklabels([str(element) for element in elements])
    ax.set_xlabel('Cliques')
    ax.set_ylabel('Count')
    ax.set_title('Histogram obtained, out result: ' + str(found))

    # Annotate each bar with its count
    for i, count in enumerate(counts):
        ax.text(i, count + 0.1, str(count), ha='center', va='bottom')

    fig.savefig(plots_dir + title + '.png')
    plt.show()
    # Release the figure: this function is called once per clustered graph
    plt.close(fig)
    

In [None]:
#Author:Vendramin Riccardo

# Find target's best clique for every clustered graph
for i in range(len(clustered_graphs)):
    all_cliques = []
    # targets[i] is the node clustered_graphs[i] was built around
    # (it used to be hard-coded to node 12)
    selected_node = targets[i]
    graph = create_undirected_graph_from_directed(clustered_graphs[i])
    BronKerbosch([], list(graph.nodes), [], graph.edges, all_cliques)
    node_cliques = nodeCliques(all_cliques, selected_node)
    print("Cliques nodo:")
    print(node_cliques)
    print("Best clique")
    best_cl = bestClique(node_cliques, runNodeCC(graph))
    print(best_cl)
    results = test_solution(graph)
    # Even positions hold the sampled clustering, odd positions the unsampled one
    end = "sampled" if i % 2 == 0 else "unsampled"
    # The suffix keeps the two plots of the same graph in separate files
    generate_histogram(results, "Result-histogram-" + files[i // 2] + "-" + end, best_cl)