In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import csr_matrix
import networkx as nx
from pyvis.network import Network

# Load data
posts = pd.read_csv('../physics_csv/Posts_with_vote_counts.csv')
posts['Tags_clean'] = posts['Tags'].apply(lambda x: x.strip('|').split('|'))

# Use full dataset (or adjust sample size as needed)
data_to_use = posts  # All rows
print(f"Total posts: {len(data_to_use)}")

# Create sparse tag matrix
mlb = MultiLabelBinarizer(sparse_output=True)
tag_matrix_sparse = mlb.fit_transform(data_to_use['Tags_clean'])

# Compute co-occurrence matrix (sparse)
co_occurrence_sparse = tag_matrix_sparse.T @ tag_matrix_sparse
co_occurrence_sparse.setdiag(0)
print("Sparse co-occurrence matrix created.")
print(f"Unique tags: {len(mlb.classes_)}")
print(f"Matrix shape: {co_occurrence_sparse.shape}, Non-zero entries: {co_occurrence_sparse.nnz}")

# Convert to COO format
coo = co_occurrence_sparse.tocoo()
# Ensure edge filtering to reduce graph size
EDGE_THRESHOLD = 1000
edges = [(i, j, v) for i, j, v in zip(coo.row, coo.col, coo.data) if v >= EDGE_THRESHOLD]
print(f"Filtered {len(edges)} edges with weight >= {EDGE_THRESHOLD}")

# Build graph
G = nx.Graph()
for i, j, w in edges:
    G.add_edge(str(mlb.classes_[i]), str(mlb.classes_[j]), weight=int(w), value=int(w))

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# Create PyVis network
net = Network(notebook=False, height="1000px", width="100%")
net.from_nx(G)

for node in net.nodes:
    node['size'] = 15
    node['title'] = node['id']

for edge in net.edges:
    edge['title'] = f"Weight: {edge.get('value', 'N/A')}"
    edge['width'] = edge.get('value', 1) * 0.1

# Save HTML
output_file = "tag_cooccurrence_network_full.html"
net.write_html(output_file)
print(f"Graph saved as {output_file}")



Total posts: 576958
Sparse co-occurrence matrix created.
Unique tags: 898
Matrix shape: (898, 898), Non-zero entries: 148308
Filtered 160 edges with weight >= 1000
Graph has 75 nodes and 80 edges.
Graph saved as tag_cooccurrence_network_full.html
