In [40]:
import random
import statistics

import igraph as ig
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Locations of the three Elliptic CSV files, relative to this notebook.
classes_path = "../elliptic_bitcoin_dataset/elliptic_txs_classes.csv"
edges_path = "../elliptic_bitcoin_dataset/elliptic_txs_edgelist.csv"
features_path = "../elliptic_bitcoin_dataset/elliptic_txs_features.csv"

# The features file is read without a header row, so its columns are named
# here: the transaction id, the time step, then 93 `trans_feat_*` columns
# followed by 72 `agg_feat_*` columns.
feat_cols = [
    'txId',
    'time_step',
    *(f'trans_feat_{i}' for i in range(93)),
    *(f'agg_feat_{i}' for i in range(72)),
]

# Per-transaction labels; the two columns are renamed to fixed names.
classes = pd.read_csv(classes_path)
classes.columns = ['txId', 'label']

# Edge list and per-transaction feature table.
edges = pd.read_csv(edges_path)
feats = pd.read_csv(features_path, header=None, names=feat_cols)

# One row per labelled transaction, with its features joined on txId.
df = classes.set_index('txId').join(
    feats.set_index('txId')
)
df.head(10)

In [None]:
# Vertex colour used for each label value present in `classes`.
label_colors = {'1':'red', '2':'green', 'unknown':'white'}

# Undirected transaction graph: one vertex per row of `classes`.
tx_graph = ig.Graph(directed=False)

# Maps a transaction id to its vertex index (its row position in `classes`).
# Despite the name it indexes vertices, not edges; the name is kept because
# other cells may refer to it.
edges_dict = {tx_id: i for i, tx_id in enumerate(classes['txId'])}

tx_graph.add_vertices(len(classes))
tx_graph.vs['id'] = list(classes['txId'])
tx_graph.vs['type'] = list(classes['label'])
# Look the time step up by txId instead of copying `feats` row by row: the
# positional copy is only correct when `feats` and `classes` list the
# transactions in exactly the same order.
tx_graph.vs['time_step'] = (
    feats.set_index('txId')['time_step'].reindex(classes['txId']).tolist()
)
tx_graph.vs['color'] = [label_colors[label] for label in classes['label']]

# Translate every (txId1, txId2) pair into a pair of vertex indices.
# Iterating the two columns directly avoids a pandas label lookup per row.
edges_list = [
    (edges_dict[src], edges_dict[dst])
    for src, dst in tqdm(zip(edges['txId1'], edges['txId2']), total=len(edges))
]
tx_graph.add_edges(edges_list)


In [None]:
# Load the per-node values stored in '../data/shortest_path_exact.txt'
# (one "Node <id>: <value>" entry per line), compute their mean and variance,
# and rank the nodes by value.
values = []      # (node, value) tuples; sorted by value further down
values_var = []  # the values alone, in file order

# The file is parsed once and both lists are filled in the same pass.
with open('../data/shortest_path_exact.txt', 'r') as file:
    for line in file:
        entry = line.strip()
        if not entry:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        parts = entry.split(': ')
        node = int(parts[0].split()[1])  # id taken from the "Node X" prefix
        value = float(parts[1])
        values.append((node, value))
        values_var.append(value)

# Mean and sample variance (n - 1 denominator) from the standard library.
average = statistics.mean(values_var)
variance = statistics.variance(values_var)

# Sum of squared deviations from the mean. It is divided by n when printed
# below, so that figure uses an n denominator rather than n - 1.
variance_updated = 0
for value in values_var:
    variance_updated += ((value-average)*(value-average))

print(average)
print(f"La varianza dei risultati è: {variance}")
print("La varianza ricalcolata",variance_updated/len(values_var))

# Rank nodes from the largest value to the smallest.
values.sort(key=lambda x: x[1], reverse=True)

# NOTE(review): the name says 50 but the slice keeps the 5000 highest
# entries; name and size are left unchanged in case other cells rely on them.
top_50 = values[:5000]

# The ranked entries are only stored in `top_50`; nothing is printed here.


In [None]:
# Module-level accumulators filled in by approximated_mean_shortest_path_igraph
# and read by later cells. They are reset only when this cell is re-run, so
# calling the function several times keeps adding to them.
approximated_variance = []  # per-node sample variance of the finite distances
max_path = float('-inf')    # largest finite distance seen so far


def approximated_mean_shortest_path_igraph(graph, sample_size_percentage, seed=None, show_progress=True):
    """Estimate each vertex's mean shortest-path length to a sample of vertices.

    For every vertex, distances to the sampled vertices are requested from the
    graph, unreachable targets (infinite distance) are dropped, and the mean
    of the remaining distances is stored.

    Side effects: prints the sample size, raises the global ``max_path`` to the
    largest finite distance found, and appends one sample variance to the
    global ``approximated_variance`` for each vertex that has at least two
    finite distances.

    Args:
        graph: Object exposing ``vcount()`` and either ``distances`` or
            ``shortest_paths`` accepting ``source=`` and ``target=``.
        sample_size_percentage: Percentage of the vertices to sample. When the
            resulting sample size is at least the vertex count, every vertex
            is used and no random sampling takes place.
        seed: Optional seed for the sampling. ``None`` (the default) draws
            from the global ``random`` state.
        show_progress: When true (the default) the vertex loop is wrapped in a
            ``tqdm`` progress bar.

    Returns:
        dict mapping vertex index to its mean finite distance, or to
        ``float('inf')`` when none of the sampled vertices is reachable.
    """
    global max_path

    nodes = list(range(graph.vcount()))
    sample_size = int(len(nodes)*sample_size_percentage/100)
    print("Coreset size:",sample_size)

    # Sample a subset of vertices, or use all of them when the sample would
    # cover the whole graph anyway.
    if len(nodes) <= sample_size:
        sampled_nodes = nodes
    else:
        sampler = random if seed is None else random.Random(seed)
        sampled_nodes = sampler.sample(nodes, sample_size)

    # Prefer `distances` when the graph object has it and fall back to
    # `shortest_paths` otherwise, so the code runs with either method name.
    distance_fn = getattr(graph, 'distances', None) or graph.shortest_paths

    # A progress bar replaces printing one counter line per vertex.
    progress = tqdm(nodes, desc="shortest paths") if show_progress else nodes

    mean_shortest_paths = {}
    for node in progress:
        distances = distance_fn(source=node, target=sampled_nodes)[0]

        # Drop unreachable targets, which are reported as infinity.
        reachable_distances = [d for d in distances if d != float('inf')]

        if not reachable_distances:
            mean_shortest_paths[node] = float('inf')
            continue

        mean_shortest_paths[node] = sum(reachable_distances) / len(reachable_distances)
        max_path = max(max_path, max(reachable_distances))

        # The sample variance needs at least two data points; a vertex that
        # reaches a single sampled vertex would otherwise raise StatisticsError.
        if len(reachable_distances) >= 2:
            approximated_variance.append(statistics.variance(reachable_distances))

    return mean_shortest_paths

# Run the estimate on the transaction graph with a sample of 100 percent of
# the vertices; with a sample that large the function uses every vertex.
results = approximated_mean_shortest_path_igraph(
    graph=tx_graph,
    sample_size_percentage=100,
)

In [None]:
# Report the largest and the average per-node variance collected in
# `approximated_variance`, together with the longest finite distance found.
summary_rows = (
    ("maximum variance between the ones of the single nodes:", max(approximated_variance)),
    ("mean variance (can't be used for our inequality:", statistics.mean(approximated_variance)),
    ("longhest path:", max_path),
)
for caption, figure in summary_rows:
    print(caption, figure)

In [46]:
# Persist the per-node means, one "Node <id>: <mean>" entry per line.
# NOTE(review): the file name ends in "_10" while the visible call that
# produces `results` uses a sample percentage of 100 - confirm which is meant.
with open('../data/shortest_path_10.txt', 'w') as out:
    out.writelines(f"Node {node}: {mean}\n" for node, mean in results.items())