# In [None]:
import os
import pickle
import networkx as nx
import scipy.sparse as sp

##############################

def calculate_metrics(graph, graph_file):
    """Summarise a graph as one flat record of network statistics.

    Args:
        graph: NetworkX graph to measure. The caller in this file passes a
            simple undirected graph reduced to its largest connected
            component.
        graph_file: Path the graph was loaded from. Only its base name is
            kept, as a label for the record.

    Returns:
        A dict holding the file label, the node count, and the per-node
        averages of degree, betweenness centrality and closeness centrality,
        together with density, degree assortativity, average shortest path
        length and diameter.

    Raises:
        ZeroDivisionError: If the graph has no nodes, because every average
            divides by the node count.
    """
    node_count = graph.number_of_nodes()

    def per_node_mean(values):
        # Every "Avg ..." entry is a plain sum divided by the node count.
        return sum(values) / node_count

    mean_degree = per_node_mean(degree for _node, degree in graph.degree())
    mean_betweenness = per_node_mean(nx.betweenness_centrality(graph).values())
    edge_density = nx.density(graph)
    degree_assortativity = nx.degree_assortativity_coefficient(graph)
    mean_path_length = nx.average_shortest_path_length(graph)
    graph_diameter = nx.diameter(graph)
    mean_closeness = per_node_mean(nx.closeness_centrality(graph).values())

    # Key order is kept stable because the record is pickled and later read
    # back as-is.
    record = {}
    record['Graph File'] = os.path.basename(graph_file)
    record['Num Nodes'] = node_count
    record['Avg Degree'] = mean_degree
    record['Avg Betweenness Centrality'] = mean_betweenness
    record['Density'] = edge_density
    record['Assortativity'] = degree_assortativity
    record['Avg Shortest Path Length'] = mean_path_length
    record['Diameter'] = graph_diameter
    record['Avg Closeness'] = mean_closeness
    return record

def _append_to_pickled_list(path, item):
    """Append *item* to the pickled list stored at *path*, creating it if absent."""
    if os.path.exists(path):
        with open(path, 'rb') as f:
            items = pickle.load(f)
    else:
        items = []
    items.append(item)
    with open(path, 'wb') as f:
        pickle.dump(items, f)


def process_graph(graph_file, output_dir, max_nodes=None, min_nodes=None):
    """Load one pickled graph, compute its metrics and adjacency matrix, and save both.

    The graph is simplified to an undirected graph without self-loops or
    parallel edges and, if disconnected, reduced to its largest connected
    component. The metrics record is appended to ``network_metrics.pkl`` and
    the sparse adjacency matrix to ``network_matrices.pkl`` in *output_dir*.

    Args:
        graph_file: Path to the pickle file holding the graph data.
        output_dir: Directory in which the two result files are kept.
        max_nodes: Largest node count to process; ``None`` means no upper
            limit. Choose it according to the available computational power.
        min_nodes: Smallest node count to process; ``None`` means no lower
            limit.

    Returns:
        None. Files that cannot be loaded, graphs outside the size limits and
        graphs without nodes are skipped.
    """
    try:
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file; only point this at graph files from a trusted source.
        with open(graph_file, 'rb') as f:
            graph_data = pickle.load(f)
    except (pickle.PickleError, EOFError, OSError) as e:
        # EOFError covers truncated/empty pickles; OSError covers missing or
        # unreadable paths (FileNotFoundError is a subclass).
        print(f"Skipping {graph_file}: Error loading pickle file: {str(e)}")
        return

    # Convert the loaded data to a MultiDiGraph object
    graph = nx.MultiDiGraph(graph_data)

    num_nodes = graph.number_of_nodes()

    # Size filter, applied to the graph as loaded (before the reduction to
    # the largest connected component below).
    too_large = max_nodes is not None and num_nodes > max_nodes
    too_small = min_nodes is not None and num_nodes < min_nodes
    if too_large or too_small:
        return

    if num_nodes == 0:
        # Connectivity and the per-node averages are undefined without nodes.
        print(f"Skipping {graph_file}: graph has no nodes")
        return

    # Remove self-loops and duplicate edges
    graph = nx.DiGraph(graph)
    graph.remove_edges_from(nx.selfloop_edges(graph))
    graph = nx.Graph(graph)

    if not nx.is_connected(graph):
        largest_connected_component = max(nx.connected_components(graph), key=len)
        graph = graph.subgraph(largest_connected_component)

    # Compute both results before writing either, so a failure here cannot
    # leave the metrics list and the matrices list out of step.
    metrics = calculate_metrics(graph, graph_file)
    adjacency_matrix_sparse = sp.csr_matrix(nx.adjacency_matrix(graph))

    _append_to_pickled_list(os.path.join(output_dir, "network_metrics.pkl"), metrics)
    print(f"Metrics for {os.path.basename(graph_file)} saved successfully.")

    _append_to_pickled_list(
        os.path.join(output_dir, "network_matrices.pkl"), adjacency_matrix_sparse
    )
    print(f"Network matrix for {os.path.basename(graph_file)} saved successfully.")

def main(input_dir=r"", output_dir=r""):
    """Process every ``.pkl`` graph file in *input_dir*, saving results to *output_dir*.

    Args:
        input_dir: Directory containing the pickled graph files. Set the
            default here or pass a path explicitly.
        output_dir: Directory for the result files; created if missing.

    Raises:
        FileNotFoundError: If either directory is left unset (empty), or if
            *input_dir* does not exist.
    """
    if not input_dir or not output_dir:
        # An empty path would fail inside os.makedirs/os.listdir with an
        # opaque error; fail early with the same exception type instead.
        raise FileNotFoundError(
            "input_dir and output_dir must both be set to directory paths"
        )

    os.makedirs(output_dir, exist_ok=True)

    # When results are written next to the inputs, the result files must not
    # be picked up as graphs on a later run.
    result_files = ("network_metrics.pkl", "network_matrices.pkl")
    same_dir = os.path.abspath(input_dir) == os.path.abspath(output_dir)

    # Sorted so the order of the appended results is reproducible.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".pkl"):
            continue
        if same_dir and filename in result_files:
            continue
        graph_file = os.path.join(input_dir, filename)
        process_graph(graph_file, output_dir)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
