## Load Dataset

In [3]:
import networkx as nx
import os
def load_network(edge_list_file):
    """
    Read an edge list file into a graph and print its node/edge counts.

    Args:
        edge_list_file (str): Path to the edge list file

    Returns:
        networkx.Graph: The graph built by ``nx.read_edgelist``

    Raises:
        FileNotFoundError: If the path does not exist or the file is empty.
        Exception: Any other error is reported on stdout and re-raised as-is.
    """
    try:
        # Usable only if the path exists AND has content; the size lookup is
        # skipped for a missing path thanks to short-circuiting.
        usable = os.path.exists(edge_list_file) and os.path.getsize(edge_list_file) != 0
        if not usable:
            raise FileNotFoundError(f"Edge list file not found or empty: {edge_list_file}")

        # Parse the edge list and report the size of the resulting graph
        graph = nx.read_edgelist(edge_list_file)
        print(f"Network loaded from {edge_list_file}")
        print(f"  Number of nodes: {graph.number_of_nodes()}")
        print(f"  Number of edges: {graph.number_of_edges()}")

    except Exception as err:
        # Log the failure for the notebook reader, then let it propagate
        print(f"Error in load_network: {str(err)}")
        raise

    else:
        return graph

# Load the network from an edge list file addressed relative to the current
# working directory; G is the graph that the later cells operate on.
edge_list_file_vec = "./network_similarity_vec.txt"
G = load_network(edge_list_file_vec)

Network loaded from ./network_similarity_vec.txt
  Number of nodes: 35596
  Number of edges: 58059686


## Analyze Network

In [16]:
def analyze_network(G):
    """
    Report connected-component statistics for a network graph.

    The components are enumerated in a single pass, which both counts them
    and tracks the largest one, so the graph is only traversed once.

    Args:
        G (networkx.Graph): The network graph to analyze

    Returns:
        tuple: ``(G, largest_cc)`` where ``G`` is the graph that was passed in
        and ``largest_cc`` is the largest connected component as yielded by
        ``nx.connected_components`` (the first one encountered on ties), or
        ``None`` when the graph has no components.

    Raises:
        Exception: Any error is reported on stdout and re-raised as-is.
    """
    try:
        print(f"\nNetwork Analysis:")

        # Connected components analysis: count and find the largest in one pass
        num_components = 0
        largest_cc = None
        for component in nx.connected_components(G):
            num_components += 1
            # Strict '>' keeps the first maximal component, matching max(..., key=len)
            if largest_cc is None or len(component) > len(largest_cc):
                largest_cc = component
        print(f"  Number of connected components: {num_components}")

        if largest_cc is not None:
            largest_cc_size = len(largest_cc)
            print(f"  Size of largest connected component: {largest_cc_size} nodes")
            print(f"  Percentage of nodes in largest component: {largest_cc_size/G.number_of_nodes()*100:.2f}%")

        return G, largest_cc

    except Exception as e:
        print(f"Error in analyze_network: {str(e)}")
        raise

# Analyze the network. analyze_network returns the graph it was given together
# with its largest connected component, so G is rebound to the same graph and
# Giant holds that component's nodes (None if the graph has no components).
G, Giant = analyze_network(G)


Network Analysis:
  Number of connected components: 62
  Size of largest connected component: 7960 nodes
  Percentage of nodes in largest component: 22.36%


In [None]:
# TODO: move all of the statistical analyses below into analyze_network
# TODO: add helper functions for centrality, clustering, etc.

In [21]:
# Materialize the giant component as an independent graph so later analyses
# do not go through a subgraph view of G.
# Building g_cc took about 6 minutes when this was last run.
g_cc = G.subgraph(Giant).copy()
print("Giant graph made")

# Network diameter
# The default nx.diameter computes the eccentricity of every node and was
# interrupted before finishing on this graph. usebounds=True returns the same
# exact diameter but uses bounding to skip most per-node searches.
diameter = nx.diameter(g_cc, usebounds=True)
print(f"Network diameter: {diameter:.4f}")


KeyboardInterrupt



In [None]:
# Network density
density = nx.density(G)
print(f"Network density: {density:.4f}")

# Network shortest path
# Average shortest path length is only defined on a connected graph, and
# nx.average_shortest_path_length raises NetworkXError otherwise. G has more
# than one connected component, so measure the giant component instead.
shortest_path_avg = nx.average_shortest_path_length(G.subgraph(Giant))
print(f"Network shortest average path (giant component): {shortest_path_avg:.4f}")

### Centrality Analysis

In [None]:
# Per-node centrality scores computed on the full graph G.
# The three results are kept as separate statements so that an interrupted
# run still leaves the already-finished ones bound.
# NOTE(review): G was reported with 35596 nodes and 58059686 edges when it was
# loaded; exact closeness and betweenness presumably need a shortest-path
# search from every node, so the last two calls may be very slow here.
# Consider sampling (e.g. the `k` argument of nx.betweenness_centrality) - TODO confirm.
degree_centralities = nx.degree_centrality(G)
closeness_centralities = nx.closeness_centrality(G)
betweenness_centralities = nx.betweenness_centrality(G)

# TODO: plot the centrality distributions

### Clustering Analysis

In [None]:
# Compute the per-node clustering coefficients once and derive every summary
# from that single result. nx.average_clustering(G) would recompute the same
# coefficients internally before averaging them.
clustering_coefficients = nx.clustering(G)

# Average clustering coefficient (mean over all nodes, zeros included)
print(sum(clustering_coefficients.values()) / len(clustering_coefficients))

# Smallest and largest per-node clustering coefficient
print(min(clustering_coefficients.values()))
print(max(clustering_coefficients.values()))

### Degree Distribution Analysis

In [None]:
# Degree statistics used for the small-world and heavy-tail analysis

# Map every node to its degree
degrees = dict(G.degree())
print(sum(degrees.values()))  # Total degree
print(sum(degrees.values()) / G.number_of_nodes())  # Average degree

# Degrees ordered from largest to smallest (rank order)
degree_sequence = list(degrees.values())
degree_sequence.sort(reverse=True)
# TODO: plot degree sequences

In [None]:
import matplotlib.pyplot as plt

# Ranks are 1-based. Plotting against the implicit 0-based index would put the
# highest-degree node at x=0, which cannot be drawn on a logarithmic axis and
# would be dropped from the log-log plot.
ranks = range(1, len(degree_sequence) + 1)

# Degree rank plot on log-log axes (a heavy tail shows up as a near-straight line)
fig, ax = plt.subplots()
ax.loglog(ranks, degree_sequence, 'b-', marker='o')
ax.set_title('Degree rank plot (loglog)')
ax.set_ylabel('Degree')
ax.set_xlabel('Rank')
plt.show()

# Same data on linear axes for comparison
fig, ax = plt.subplots()
ax.plot(ranks, degree_sequence, 'b-', marker='o')
ax.set_title('Degree rank plot (linear)')
ax.set_ylabel('Degree')
ax.set_xlabel('Rank')
plt.show()

### Random Model Comparison