In [None]:
import networkx as nx

# Read the congress edge list into a directed graph so that edge direction
# is preserved for the hub/authority and PageRank computations below.
edgelist_path = 'congress.edgelist'
G = nx.read_edgelist(edgelist_path, create_using=nx.DiGraph())

<networkx.classes.digraph.DiGraph at 0x7ff5ff390860>

In [4]:
# Hub/authority scores computed by fixed-count power iteration over G.
# Every node starts with a score of 1.0 in both roles.
hubs = {node: 1.0 for node in G.nodes()}
authorities = {node: 1.0 for node in G.nodes()}

# Set the number of iterations
num_iterations = 100

for _ in range(num_iterations):
    # Update authorities: a node's authority is the sum of the hub scores
    # of the nodes that point to it.
    new_authorities = {node: 0.0 for node in G.nodes()}
    for node in G.nodes():
        for neighbor in G.predecessors(node):
            new_authorities[node] += hubs[neighbor]

    # Normalize authorities so they sum to 1. When the graph has nodes but
    # no edges the sum is 0; fall back to 1.0 so the scores stay at 0.0
    # instead of raising ZeroDivisionError.
    norm = sum(new_authorities.values()) or 1.0
    for node in new_authorities:
        new_authorities[node] /= norm

    # Update hubs: a node's hub score is the sum of the (freshly updated)
    # authority scores of the nodes it points to.
    new_hubs = {node: 0.0 for node in G.nodes()}
    for node in G.nodes():
        for neighbor in G.successors(node):
            new_hubs[node] += new_authorities[neighbor]

    # Normalize hubs, with the same zero-sum guard as above.
    norm = sum(new_hubs.values()) or 1.0
    for node in new_hubs:
        new_hubs[node] /= norm

    hubs = new_hubs
    authorities = new_authorities

print("Hubs:", hubs)
print("Authorities:", authorities)

Hubs: {'0': 0.0009851917010086659, '4': 0.0018337527126278791, '12': 0.001207711089373623, '18': 0.0007753165076147423, '25': 0.0024795112868950383, '30': 0.0021045396435460345, '46': 0.0018839189833571835, '55': 0.001683399558027719, '58': 0.002162914056101447, '59': 0.0020659494628655936, '74': 0.001079833388742128, '76': 0.0019187767756666252, '77': 0.001584680038957044, '85': 0.0012264297087960995, '86': 0.0017176089017746571, '87': 0.004216029427906556, '154': 0.003072242877288125, '168': 0.0007442974879174176, '341': 0.002763916486962609, '374': 0.0020363352249333565, '401': 0.002428772437707398, '3': 0.0028339400548015917, '14': 0.001345891206580611, '17': 0.0027348658937600408, '24': 0.00259446167799016, '27': 0.0020338450131804423, '64': 0.0016547794044967411, '79': 0.0013284745425182038, '84': 0.0017492322722284235, '88': 0.0016736848476156265, '89': 0.001205572126314971, '149': 0.0031183741164794456, '179': 0.005546182186376116, '197': 0.0015958872514130007, '213': 0.0003824

In [None]:
# PageRank by fixed-count power iteration, starting from a uniform distribution.
num_nodes = len(G)
pagerank = {node: 1.0 / num_nodes for node in G.nodes()}

damping_factor = 0.85
num_iterations = 100

# Out-degrees do not change between iterations, so compute them once instead
# of materialising each successor list inside the innermost loop.
out_degree = dict(G.out_degree())

# Nodes without outgoing edges never pass their rank on through an edge.
# Their rank is spread uniformly over all nodes each iteration; otherwise it
# is lost and the scores stop summing to 1.
dangling_nodes = [node for node, degree in out_degree.items() if degree == 0]

for _ in range(num_iterations):
    dangling_mass = sum(pagerank[node] for node in dangling_nodes)

    # Score every node receives regardless of its in-edges: the teleport
    # term plus its share of the dangling mass. Guarded so an empty graph
    # does not divide by zero.
    if num_nodes:
        base_score = (1 - damping_factor) / num_nodes + damping_factor * dangling_mass / num_nodes
    else:
        base_score = 0.0

    new_pagerank = {node: base_score for node in G.nodes()}
    for node in G.nodes():
        for neighbor in G.predecessors(node):
            # Each in-neighbour splits its rank evenly over its out-edges.
            new_pagerank[node] += damping_factor * pagerank[neighbor] / out_degree[neighbor]

    pagerank = new_pagerank

print("PageRank:", pagerank)

In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms of the hub scores (left) and authority scores (right).
fig, (hub_ax, authority_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: hub scores
hub_ax.hist(hubs.values(), bins=30, color='blue', alpha=0.7)
hub_ax.set_title('Distribution of Hub Scores')
hub_ax.set_xlabel('Hub Score')
hub_ax.set_ylabel('Frequency')

# Right panel: authority scores
authority_ax.hist(authorities.values(), bins=30, color='green', alpha=0.7)
authority_ax.set_title('Distribution of Authority Scores')
authority_ax.set_xlabel('Authority Score')
authority_ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): this cell draws the same hub/authority histograms as the
# preceding plotting cell - confirm whether a different plot was intended.
plt.figure(figsize=(12, 6))

# (subplot position, scores, bar colour, title, x-axis label) for each panel
panels = [
    (1, hubs, 'blue', 'Distribution of Hub Scores', 'Hub Score'),
    (2, authorities, 'green', 'Distribution of Authority Scores', 'Authority Score'),
]

for position, scores, bar_color, panel_title, x_label in panels:
    plt.subplot(1, 2, position)
    plt.hist(scores.values(), bins=30, color=bar_color, alpha=0.7)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
from scipy.stats import pearsonr

# Relies on `hubs`, `authorities` and `pagerank` from the earlier cells.
# The three score lists are compared position by position, so the
# dictionaries must iterate their nodes in the same order.
pagerank_scores = [*pagerank.values()]
hub_scores = [*hubs.values()]
authority_scores = [*authorities.values()]

# Hub scores vs. PageRank scores (the p-value returned by pearsonr is discarded)
hub_pagerank_corr, _ = pearsonr(hub_scores, pagerank_scores)
print(
    "Pearson's correlation coefficient between hub scores and PageRank scores:",
    hub_pagerank_corr,
)

# Authority scores vs. PageRank scores
authority_pagerank_corr, _ = pearsonr(authority_scores, pagerank_scores)
print(
    "Pearson's correlation coefficient between authority scores and PageRank scores:",
    authority_pagerank_corr,
)

In [None]:
import networkx as nx

def girvan_newman(G, num_communities):
    """Split a graph into communities by repeatedly removing high-betweenness edges.

    The edge with the highest edge betweenness centrality is removed, and the
    connected components recomputed, until there are at least
    ``num_communities`` components or no edges are left to remove.

    Parameters
    ----------
    G : networkx graph
        Input graph; it is not modified. A directed graph is converted to an
        undirected one first, because connected components are only defined
        for undirected graphs.
    num_communities : int
        Desired minimum number of communities.

    Returns
    -------
    list
        The connected components (node sets) of the pruned graph. This may
        hold fewer than ``num_communities`` entries if the graph runs out of
        edges first, or more if the graph already had more components.
    """
    # Work on a copy so the caller's graph is untouched. to_undirected()
    # already returns a new graph, so no extra copy is needed in that branch.
    G_copy = G.to_undirected() if G.is_directed() else G.copy()

    def edge_to_remove(graph):
        """Return the edge of `graph` with the highest betweenness centrality."""
        edge_betweenness = nx.edge_betweenness_centrality(graph)
        return max(edge_betweenness, key=edge_betweenness.get)

    communities = list(nx.connected_components(G_copy))

    # Stop as soon as the target is reached, or when nothing is left to
    # remove (otherwise max() above would fail on an empty mapping).
    while len(communities) < num_communities and G_copy.number_of_edges() > 0:
        G_copy.remove_edge(*edge_to_remove(G_copy))
        communities = list(nx.connected_components(G_copy))

    return communities

# Target number of communities for the split
num_communities = 5

# Run the Girvan-Newman routine defined above on G
communities = girvan_newman(G, num_communities=num_communities)

# Report every community found, numbered from 1
for i, community in enumerate(communities):
    print(f"Community {i+1}: {community}")

In [None]:
import networkx as nx
from networkx.algorithms.community import girvan_newman
from networkx.drawing.nx_agraph import graphviz_layout

import matplotlib.pyplot as plt

# Build Zachary's karate club graph
karate_club_graph = nx.karate_club_graph()

# girvan_newman is consumed as an iterator here: the first item it yields is
# taken as the first level of communities.
comp = girvan_newman(karate_club_graph)
communities = next(comp)

# How many communities there are at this level
num_communities = len(communities)
print(f"Number of communities: {num_communities}")

# How many nodes each community holds
community_sizes = list(map(len, communities))
print(f"Distribution of community sizes: {community_sizes}")

# Draw the graph, colouring each community differently (the palette is
# cycled if there are more communities than colours).
pos = nx.spring_layout(karate_club_graph)
colors = ['r', 'g', 'b', 'y', 'c', 'm']

for i, community in enumerate(communities):
    nx.draw_networkx_nodes(
        karate_club_graph,
        pos,
        nodelist=community,
        node_color=colors[i % len(colors)],
        label=f'Community {i+1}',
    )

nx.draw_networkx_edges(karate_club_graph, pos)
nx.draw_networkx_labels(karate_club_graph, pos)
plt.legend()
plt.title("Community structure in Zachary's karate club network")
plt.show()

# Visualize the hierarchical tree

# Run the algorithm again so that every level is included: the `comp`
# iterator above has already been advanced past the first split by
# `next(comp)`, so iterating it here would skip that level and mislabel the rest.
hierarchy_levels = girvan_newman(karate_club_graph)

# Create a new graph to represent the hierarchical tree
hierarchical_tree = nx.Graph()

# One node per (level, community), linked to each of its member nodes.
# A distinct loop name keeps the first-level `communities` above intact.
for level, level_communities in enumerate(hierarchy_levels, start=1):
    for index, community in enumerate(level_communities, start=1):
        community_label = f"Level {level} - Community {index}"
        hierarchical_tree.add_node(community_label, size=len(community))
        for node in community:
            hierarchical_tree.add_edge(node, community_label)

# Draw the hierarchical tree
pos = graphviz_layout(hierarchical_tree, prog='dot')
# Member nodes are created implicitly by add_edge and carry no 'size'
# attribute, so they fall back to a size of 1 instead of raising KeyError.
sizes = [hierarchical_tree.nodes[node].get('size', 1) * 100 for node in hierarchical_tree.nodes()]
nx.draw(hierarchical_tree, pos, with_labels=True, node_size=sizes, node_color='lightblue', font_size=8)
plt.title("Hierarchical tree of communities")
plt.show()

In [None]:
import os
import tarfile
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score
from node2vec import Node2Vec
import numpy as np

import urllib.request
import matplotlib.pyplot as plt

# Step 1: Download and extract the dataset
url = 'https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz'
dataset_dir = 'cora_dataset'
archive_path = os.path.join(dataset_dir, 'cora.tgz')


def _find_cora_data_dir(root):
    """Return the directory under `root` holding both Cora files, or None."""
    # NOTE(review): the archive appears to unpack into a 'cora/' sub-directory;
    # `root` itself is also checked in case the files sit there directly.
    for candidate in (os.path.join(root, 'cora'), root):
        if all(
            os.path.isfile(os.path.join(candidate, name))
            for name in ('cora.cites', 'cora.content')
        ):
            return candidate
    return None


os.makedirs(dataset_dir, exist_ok=True)
cora_data_dir = _find_cora_data_dir(dataset_dir)
if cora_data_dir is None:
    # Key the download on the archive itself, not the directory, so that an
    # interrupted earlier attempt is retried instead of silently skipped.
    if not os.path.exists(archive_path):
        partial_path = archive_path + '.part'
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, archive_path)

    with tarfile.open(archive_path, 'r:gz') as tar:
        # The archive comes from the network: use the 'data' extraction
        # filter where this Python version provides it, so members cannot
        # be written outside dataset_dir.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=dataset_dir, filter='data')
        else:
            tar.extractall(path=dataset_dir)

    cora_data_dir = _find_cora_data_dir(dataset_dir)
    if cora_data_dir is None:
        raise FileNotFoundError(
            f"cora.cites / cora.content not found under {dataset_dir!r} after extraction"
        )

# Step 2: Build the network from cora.cites file
cites_file = os.path.join(cora_data_dir, 'cora.cites')
G = nx.read_edgelist(cites_file, create_using=nx.DiGraph(), nodetype=int)

# Step 3: Read the class labels from cora.content file
# The file has no header row: the first column is named 'node', the last
# 'label', and everything in between 'feat_<i>'.
content_file = os.path.join(cora_data_dir, 'cora.content')
content = pd.read_csv(content_file, sep='\t', header=None)
content.columns = ['node'] + [f'feat_{i}' for i in range(content.shape[1] - 2)] + ['label']
node_labels = content.set_index('node')['label'].to_dict()

# Step 4: Apply the node2vec algorithm to generate node embeddings
def generate_node2vec_embeddings(G, dimensions=128, walk_length=80, num_walks=10, p=1, q=1,
                                 window=10, workers=4):
    """Train node2vec on ``G`` and return one embedding vector per node.

    Parameters
    ----------
    G : networkx graph
        Graph passed to ``Node2Vec``.
    dimensions, walk_length, num_walks, p, q
        Forwarded unchanged to ``Node2Vec``.
    window : int, default 10
        Forwarded to ``fit`` as ``window``. Previously hard-coded; exposed so
        it can be swept like the other parameters.
    workers : int, default 4
        Forwarded to ``Node2Vec`` as ``workers``. Previously hard-coded.

    Returns
    -------
    dict
        Maps ``str(node)`` to the vector ``model.wv[str(node)]`` for every
        node in ``G``.
    """
    walker = Node2Vec(G, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks,
                      p=p, q=q, workers=workers)
    model = walker.fit(window=window, min_count=1, batch_words=4)
    # The vectors are looked up by the string form of each node id, and the
    # returned dict uses those same string keys.
    return {str(node): model.wv[str(node)] for node in G.nodes()}

# Baseline embeddings with the default node2vec parameters
embeddings = generate_node2vec_embeddings(G)

# Step 5: Use logistic regression classifier on the obtained node embeddings
# Features and labels are both built by walking G's nodes in the same order,
# so row i of X corresponds to entry i of y.
node_order = list(G.nodes())
X = np.array([embeddings[str(node)] for node in node_order])
y = np.array([node_labels[node] for node in node_order])

# Hold out 20% of the nodes for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Score the held-out predictions
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f"Macro-F1 Score: {macro_f1}")
print(f"Accuracy: {accuracy}")

# Step 6: Plot the required graphs
def plot_metric_vs_parameter(metric, parameter_values, parameter_name, log_scale=False,
                             metric_name='Metric'):
    """Plot a metric against the values of one swept parameter.

    Parameters
    ----------
    metric : sequence of float
        Metric values, one per entry of ``parameter_values``.
    parameter_values : sequence of float
        Parameter values, used as x coordinates.
    parameter_name : str
        Label for the x axis; also used in the title.
    log_scale : bool, default False
        If True, the x axis uses a base-2 logarithmic scale.
    metric_name : str, default 'Metric'
        Label for the y axis; also used in the title.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(parameter_values, metric, marker='o')
    plt.xlabel(parameter_name)
    plt.ylabel(metric_name)
    if log_scale:
        # `base` is the current keyword for the log base; the old `basex`
        # spelling is rejected by recent Matplotlib releases.
        plt.xscale('log', base=2)
    plt.title(f'{metric_name} vs {parameter_name}')
    plt.grid(True)
    plt.show()

# Example: Plot Macro-F1 vs log2(p)
p_values = [0.25, 0.5, 1, 2, 4]
macro_f1_scores = []

for p in p_values:
    # Sweep-local names and a fresh classifier are used so the baseline
    # `embeddings`, `X`, train/test splits, `y_pred` and fitted `clf` from
    # Step 5 are not overwritten by the last sweep iteration.
    p_embeddings = generate_node2vec_embeddings(G, p=p)
    X_p = np.array([p_embeddings[str(node)] for node in G.nodes()])
    # Same test_size and random_state as the baseline split.
    X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(
        X_p, y, test_size=0.2, random_state=42
    )
    p_clf = LogisticRegression(max_iter=1000)
    p_clf.fit(X_p_train, y_p_train)
    macro_f1_scores.append(f1_score(y_p_test, p_clf.predict(X_p_test), average='macro'))

plot_metric_vs_parameter(macro_f1_scores, p_values, 'log2(p)', log_scale=True)

# Repeat similar steps for other parameters and metrics