In [1]:
import pandas as pd
from collections import Counter, defaultdict
import itertools
import networkx as nx
import community  # pip install python-louvain

CSV_PATH = "/home/ofrie.r/graph-ncp-project/results/snap+lgc/facebook_pages/combined_min_phi.csv"

In [None]:

TOP_N = 500   # number of top nodes you care about

# 1. Load CSV
df = pd.read_csv(CSV_PATH)
df["nodes_list"] = df["nodes"].apply(lambda x: [int(i) for i in str(x).split()])

# 2. Count node frequencies
counter = Counter()
for nodes in df["nodes_list"]:
    counter.update(nodes)

# 3. Take top N nodes only
top_nodes = [node for node, freq in counter.most_common(TOP_N)]
top_node_set = set(top_nodes)

# 4. Count co-occurrences among top nodes
cooc = defaultdict(Counter)

for nodes in df["nodes_list"]:
    # keep only top nodes
    nodes_top = [n for n in nodes if n in top_node_set]
    for a, b in itertools.combinations(nodes_top, 2):
        cooc[a][b] += 1
        cooc[b][a] += 1

# 5. Build graph for Louvain clustering
G = nx.Graph()
for a in cooc:
    for b, w in cooc[a].items():
        G.add_edge(a, b, weight=w)

# 6. Louvain clustering
partition = community.best_partition(G, weight="weight")

# 7. Print results
print("\n=== TOP NODE FREQUENCIES ===")
for node, freq in counter.most_common(TOP_N):
    print(f"Node {node}: {freq} appearances")

print("\n=== CLUSTER ASSIGNMENTS FOR TOP NODES ===")
for node, cluster in partition.items():
    print(f"Node {node}: cluster {cluster}")



=== TOP NODE FREQUENCIES ===
Node 9246: 8530 appearances
Node 11114: 8347 appearances
Node 3130: 8341 appearances
Node 12696: 8281 appearances
Node 3702: 8266 appearances
Node 1412: 8232 appearances
Node 19916: 8223 appearances
Node 11938: 8146 appearances
Node 1489: 8110 appearances
Node 12442: 8108 appearances
Node 9968: 8096 appearances
Node 17738: 8089 appearances
Node 4810: 8075 appearances
Node 4816: 8074 appearances
Node 19977: 8074 appearances
Node 13266: 8074 appearances
Node 19896: 8074 appearances
Node 4052: 8074 appearances
Node 8029: 8074 appearances
Node 11382: 8073 appearances
Node 7891: 8073 appearances
Node 15977: 8073 appearances
Node 20512: 8073 appearances
Node 21784: 8073 appearances
Node 871: 8073 appearances
Node 10244: 8073 appearances
Node 17212: 8072 appearances
Node 6356: 8072 appearances
Node 5306: 8072 appearances
Node 14634: 8072 appearances
Node 19556: 8072 appearances
Node 18273: 8072 appearances
Node 6784: 8072 appearances
Node 221: 8072 appearances
No

In [None]:
(51,100),(101,500),(501,1000),(1001,5000)

In [10]:
import pandas as pd

# --- Load CSV ---
df = pd.read_csv(CSV_PATH)



# Step 1: Find the cluster with lowest phi
worst_row = df.loc[df['phi'].idxmin()]
print("Worst cluster details:")
print(worst_row["size"])
worst_nodes = set(worst_row['nodes'].split())

# Step 2: Define your bins (adjust ranges as needed)
bins = [(1, 50), (51,100),(101,500),(501,1000),(1001,5000),(5001, float('inf'))]

def get_bin(size):
    for b in bins:
        if b[0] <= size <= b[1]:
            return f"{b[0]}-{int(b[1]) if b[1] != float('inf') else 'inf'}"
    return "unknown"

# Step 3: Track appearances of worst nodes in other clusters
node_bins = {node: set() for node in worst_nodes}

for _, row in df.iterrows():
    nodes = set(row['nodes'].split())
    cluster_size = int(row['size'])
    cluster_bin = get_bin(cluster_size)
    
    for node in worst_nodes & nodes:  # only care about worst nodes
        node_bins[node].add(cluster_bin)

# Step 4: Convert to DataFrame for presentation
presentation_df = pd.DataFrame([
    {"node": node, "bins": sorted(list(bins_appeared))}
    for node, bins_appeared in node_bins.items()
])

print(presentation_df)


Worst cluster details:
35.0
     node              bins
0   16136            [1-50]
1   13114            [1-50]
2   20564            [1-50]
3   17765  [1-50, 5001-inf]
4   15498  [1-50, 5001-inf]
5    7905            [1-50]
6   16159            [1-50]
7   10519            [1-50]
8   14909            [1-50]
9    2703  [1-50, 5001-inf]
10  16273            [1-50]
11  15392  [1-50, 5001-inf]
12   8667            [1-50]
13   9391  [1-50, 5001-inf]
14   4541            [1-50]
15  12850            [1-50]
16  11342            [1-50]
17  10430            [1-50]
18   2061  [1-50, 5001-inf]
19   3333            [1-50]
20  10821  [1-50, 5001-inf]
21  13248            [1-50]
22  15774  [1-50, 5001-inf]
23  18243  [1-50, 5001-inf]
24    168  [1-50, 5001-inf]
25    610            [1-50]
26   4778  [1-50, 5001-inf]
27   1674            [1-50]
28   2091            [1-50]
29   5334            [1-50]
30  16682            [1-50]
31  17627  [1-50, 5001-inf]
32  10852            [1-50]
33  21394           

In [9]:
worst_row["size"]

np.float64(35.0)

In [11]:
worst_row

size                                                   35.0
phi                                                0.010256
nodes     2061 2703 168 4778 17765 9391 18243 15392 1577...
source                                                  lgc
Name: 34, dtype: object