## Importa Grafo do Neo4j e converte para NetworkX

In [1]:
import networkx as nx
from neo4j import GraphDatabase

URI = "bolt://localhost:7687"

password = "12345678" # CHANGE
AUTH = ("neo4j", password)

In [2]:
def get_graph_from_neo4j(driver):
    G = nx.DiGraph()
    with driver.session() as session:
        nodes_result = session.run("""
            MATCH (n)
            RETURN id(n) AS id, labels(n) AS labels, properties(n) AS properties
        """)
        for record in nodes_result:
            G.add_node(record["id"], labels=record["labels"], **record["properties"])

        rels_result = session.run("""
            MATCH (n)-[r]->(m)
            RETURN id(n) AS source, id(m) AS target, type(r) AS type
        """)
        for record in rels_result:
            G.add_edge(record["source"], record["target"], type=record["type"])
    return G

## Visualização de Métricas

In [3]:
# Auxiliary function to get top 5 nodes for each metric
def get_top_5(metric_dict):
    sorted_items = sorted(metric_dict.items(), key=lambda x: x[1], reverse=True)
    return sorted_items[:5]

In [4]:
def get_users_relation_metrics(U, graph_name):
    results = {
        'graph_name': graph_name,
        'num_nodes': U.number_of_nodes(),
        'num_edges': U.number_of_edges(),
        'density': None,
        'diameter': None,
        'giant_component_pct': None,
        'metrics': {}
    }

    components = list(nx.connected_components(U))
    giant_component_nodes = max(components, key=len)
    G_giant = U.subgraph(giant_component_nodes)

    results['giant_component_pct'] = (G_giant.number_of_nodes() / U.number_of_nodes()) * 100
    results['density'] = nx.density(G_giant)
    if nx.is_connected(G_giant):
        results['diameter'] = nx.diameter(G_giant)
    else:
        results['diameter'] = None

    # Degree Centrality: Connectivity of the user
    degree_centrality = nx.degree_centrality(U)
    results['metrics']['degree_centrality'] = get_top_5(degree_centrality)

    # Clustering Coefficient: Bubbles/niches
    clustering_coefficient = nx.clustering(U)
    results['metrics']['clustering_coefficient'] = get_top_5(clustering_coefficient)

    # Closeness Centrality: Proximity to other users
    closeness_centrality = nx.closeness_centrality(G_giant)
    results['metrics']['closeness_centrality'] = get_top_5(closeness_centrality)

    # Betweenness Centrality: Users who may act as bridges
    k_approx = min(200, U.number_of_nodes()) # Uses a sample of nodes for performance
    betweenness_centrality = nx.betweenness_centrality(U, k=k_approx, normalized=True)
    results['metrics']['betweenness_centrality'] = get_top_5(betweenness_centrality)

    return results

In [5]:
import pandas as pd

def print_metrics_table(all_results):
    summary_data = []
    for result in all_results:
        summary_data.append({
            'Graph': result['graph_name'],
            'Nodes': result['num_nodes'],
            'Edges': result['num_edges'],
            'Giant Component %': f"{result['giant_component_pct']:.2f}%" if result['giant_component_pct'] else "N/A",
            'Density': f"{result['density']:.6f}" if result['density'] else "N/A",
            'Diameter': result['diameter'] if result['diameter'] else "N/A"
        })

    summary_df = pd.DataFrame(summary_data)
    print(summary_df.to_string(index=False))
    print()

    for result in all_results:
        if result['num_nodes'] == 0:
            print(f"\n{result['graph_name']}: Empty graph")
            continue

        print("\n" + "="*80)
        print(f"{result['graph_name'].upper()} - DETAILED METRICS")
        print("="*80)

        for metric_name, top_5 in result['metrics'].items():
            print(f"\n{metric_name.upper().replace('_', ' ')}:")
            print("-" * 80)

            metric_data = []
            for rank, (user_id, value) in enumerate(top_5, 1):
                metric_data.append({
                    'Rank': rank,
                    'User ID': user_id,
                    'Value': f"{value:.6f}"
                })

            metric_df = pd.DataFrame(metric_data)
            print(metric_df.to_string(index=False))
            print()

In [6]:
import traceback

G_shares = nx.Graph()
G_viral = nx.Graph()
G_misinfo = nx.Graph()

try:
    driver = GraphDatabase.driver(URI, auth=AUTH)

    print("Connecting to Neo4j and building NetworkX graph...")

    networkx_graph = get_graph_from_neo4j(driver)

    print("Graph imported into NetworkX")

    user_nodes = {n for n, d in networkx_graph.nodes(data=True) if 'User' in d.get('labels', [])}

    # Add all user nodes to each graph
    for u_node in user_nodes:
        G_shares.add_node(u_node, **networkx_graph.nodes[u_node])
        G_viral.add_node(u_node, **networkx_graph.nodes[u_node])
        G_misinfo.add_node(u_node, **networkx_graph.nodes[u_node])

    for u, v, data in networkx_graph.edges(data=True):
        if u in user_nodes and v in user_nodes:
            edge_type = data.get('type')
            if edge_type == 'SHARES':
                G_shares.add_edge(u, v, **data)
            elif edge_type == 'VIRAL_SHARES':
                G_viral.add_edge(u, v, **data)
            elif edge_type == 'SHARES_MISINFORMATION':
                G_misinfo.add_edge(u, v, **data)

    results = [
        get_users_relation_metrics(G_shares, "Shares"),
        get_users_relation_metrics(G_viral, "Viral Shares"),
        get_users_relation_metrics(G_misinfo, "Misinformation Shares")
    ]

    print()
    print_metrics_table(results)

except Exception:
    traceback.print_exc()
finally:
    if 'driver' in locals() and driver:
        driver.close()
        print("Connection closed.")

Connecting to Neo4j and building NetworkX graph...




Graph imported into NetworkX

                Graph  Nodes  Edges Giant Component %  Density  Diameter
               Shares  10366  72869            30.59% 0.014406        10
         Viral Shares  10366     78             0.14% 0.857143         2
Misinformation Shares  10366   7844             8.29% 0.021123         9


SHARES - DETAILED METRICS

DEGREE CENTRALITY:
--------------------------------------------------------------------------------
 Rank  User ID    Value
    1   198274 0.081910
    2   198228 0.063676
    3   198418 0.062132
    4   198308 0.055572
    5   198550 0.052581


CLUSTERING COEFFICIENT:
--------------------------------------------------------------------------------
 Rank  User ID    Value
    1   198153 1.000000
    2   198159 1.000000
    3   198182 1.000000
    4   198187 1.000000
    5   198192 1.000000


CLOSENESS CENTRALITY:
--------------------------------------------------------------------------------
 Rank  User ID    Value
    1   198274 0.535563
 

In [7]:
def prepare_graph_for_gexf(graph):
    for node, data in graph.nodes(data=True):
        if 'labels' in data and isinstance(data['labels'], list):
            data['labels'] = ','.join(data['labels'])

    for u, v, data in graph.edges(data=True):
        if 'type' in data:
            data['label'] = data.pop('type')

    return graph

G_shares = prepare_graph_for_gexf(G_shares)
G_viral = prepare_graph_for_gexf(G_viral)
G_misinfo = prepare_graph_for_gexf(G_misinfo)

nx.write_gexf(G_shares, "../visualization/shares.gexf")
nx.write_gexf(G_viral, "../visualization/viral_shares.gexf")
nx.write_gexf(G_misinfo, "../visualization/misinfo_shares.gexf")
print("Export complete.")

Export complete.


## Prevendo Conexões

In [8]:
from sklearn.metrics import roc_auc_score
import random

def evaluate_predictors_on_graph(U, network_name):
    print("\n" + "="*60)
    print(f"ACCURACY EVALUATION FOR: {network_name}")
    print("="*60)

    original_edges = list(U.edges())
    random.shuffle(original_edges)

    test_size = int(len(original_edges) * 0.10)
    test_edges = original_edges[:test_size]
    train_edges = original_edges[test_size:]

    G_train = nx.Graph()
    G_train.add_nodes_from(U.nodes(data=True))
    G_train.add_edges_from(train_edges)

    print(f"Original network: {U.number_of_nodes()} nodes, {U.number_of_edges()} edges.")
    print(f"Training graph: {G_train.number_of_nodes()} nodes, {G_train.number_of_edges()} edges.")
    print(f"Test set (hidden edges): {len(test_edges)} edges.")


    # Set of negative samples for AUC calculation
    non_edges = list(nx.non_edges(G_train))
    sampled_non_edges = random.sample(non_edges, len(test_edges))

    evaluation_edges = test_edges + sampled_non_edges

    scores = {'common_neighbors': [len(list(nx.common_neighbors(G_train, u, v)))
                                  for u, v in evaluation_edges],
              'jaccard': [s for u, v, s in nx.jaccard_coefficient(G_train, evaluation_edges)],
              'adamic_adar': [s for u, v, s in nx.adamic_adar_index(G_train, evaluation_edges)],
              'pref_attachment': [s for u, v, s in nx.preferential_attachment(G_train, evaluation_edges)]}

    # Katz Proxy Calculation
    katz_centrality = nx.katz_centrality(G_train, alpha=0.005, beta=1.0)
    scores['katz_proxy'] = [(katz_centrality.get(u, 0) + katz_centrality.get(v, 0)) for u,v in evaluation_edges]

    # Graph Distance
    scores['shortest_path_distance'] = []
    for u, v in evaluation_edges:
        try:
            distance = nx.shortest_path_length(G_train, u, v)
            shortest_path_score = 1 / (1 + distance)
        except nx.NetworkXNoPath:
            shortest_path_score = 0
        scores['shortest_path_distance'].append(shortest_path_score)

    y_true = [1] * len(test_edges) + [0] * len(sampled_non_edges)

    results = {}
    for predictor_name, predicted_scores in scores.items():
        # roc score: see if predictors can correctly guess that the hidden edge is more likely to have an edge than non edge
        auc = roc_auc_score(y_true, predicted_scores)
        results[predictor_name] = auc

    print("\nAUC Scores")
    results_df = pd.DataFrame.from_dict(results, orient='index', columns=['AUC'])
    print(results_df.sort_values(by='AUC', ascending=False))

In [9]:
evaluate_predictors_on_graph(G_shares, "Shares")


ACCURACY EVALUATION FOR: Shares
Original network: 10366 nodes, 72869 edges.
Training graph: 10366 nodes, 65583 edges.
Test set (hidden edges): 7286 edges.

AUC Scores
                             AUC
adamic_adar             0.992346
common_neighbors        0.992009
jaccard                 0.991844
pref_attachment         0.986747
shortest_path_distance  0.979343
katz_proxy              0.959491


In [10]:
evaluate_predictors_on_graph(G_viral, "Viral Shares")



ACCURACY EVALUATION FOR: Viral Shares
Original network: 10366 nodes, 78 edges.
Training graph: 10366 nodes, 71 edges.
Test set (hidden edges): 7 edges.

AUC Scores
                        AUC
common_neighbors        1.0
jaccard                 1.0
adamic_adar             1.0
pref_attachment         1.0
katz_proxy              1.0
shortest_path_distance  1.0


In [11]:
evaluate_predictors_on_graph(G_viral, "Misinformation Shares")


ACCURACY EVALUATION FOR: Misinformation Shares
Original network: 10366 nodes, 78 edges.
Training graph: 10366 nodes, 71 edges.
Test set (hidden edges): 7 edges.

AUC Scores
                        AUC
common_neighbors        1.0
jaccard                 1.0
adamic_adar             1.0
pref_attachment         1.0
katz_proxy              1.0
shortest_path_distance  1.0


#### The results were too positive, which actually raises a red flag. However, in this case, we’re not looking at accuracy, since there’s no threshold defined, only comparing whether the models tend to predict that, between a hidden edge and a non-existent one, they are more likely to classify the hidden edge as connected.