In [21]:
import json
import random
from collections import Counter
from itertools import islice

import networkx as nx
import pandas as pd

In [22]:
# Read the Louvain partition from disk. JSON object keys arrive as strings,
# so they are converted to integer community ids while the dict is built.
with open("wiki_slice_subset_louvain.json", "r") as f:
    community_to_nodes = {
        int(community_id): members
        for community_id, members in json.load(f).items()
    }

# (community id, member list) pairs ordered from fewest to most members.
sorted_communities = sorted(
    community_to_nodes.items(),
    key=lambda entry: len(entry[1]),
)

# The two extremes of the size ordering.
smallest, largest = sorted_communities[0], sorted_communities[-1]

print(f"Smallest community ID: {smallest[0]}, size: {len(smallest[1])}")
print(f"Largest community ID: {largest[0]}, size: {len(largest[1])}")

Smallest community ID: 19620506, size: 1
Largest community ID: 12, size: 1188812


In [23]:
# Load the node table; the preview below shows it carries `id` and `title` columns.
df = pd.read_parquet("node_titles.parquet")
df.head()  # head() defaults to the first 5 rows

Unnamed: 0,id,title
0,10,AccessibleComputing
1,12,Anarchism
2,13,AfghanistanHistory
3,14,AfghanistanGeography
4,15,AfghanistanPeople


In [24]:
# Lookup table from node id to title, built by pairing the two columns of df row by row.
label_dict = {node_id: title for node_id, title in zip(df["id"], df["title"])}

In [25]:
def get_labels(node_ids):
    """Translate node ids into labels using the notebook-level ``label_dict``.

    The result is a list in the same order as ``node_ids``. An id that is not
    a key of ``label_dict`` is represented by the placeholder string
    ``"<no label for {id}>"``.
    """
    labels = []
    for node_id in node_ids:
        labels.append(label_dict.get(node_id, f"<no label for {node_id}>"))
    return labels

In [None]:
# Resolve member ids to labels for the smallest and the largest community.
smallest_labels = get_labels(smallest[1])
largest_labels = get_labels(largest[1])

# Write one file per community: a '#'-prefixed header line, then one label per line.
for rank_title, community_entry, community_labels in (
    ("Smallest", smallest, smallest_labels),
    ("Largest", largest, largest_labels),
):
    with open(f"{rank_title.lower()}_community_labels.txt", "w") as f:
        f.write(
            f"# {rank_title} Community ID: {community_entry[0]}, "
            f"Size: {len(community_entry[1])}\n"
        )
        # The f-string coerces every label to text. get_labels returns whatever
        # label_dict holds, and `label + "\n"` raises TypeError for a non-string
        # value (the fifth-largest writer elsewhere in this notebook already
        # guards against this with str()).
        f.writelines(f"{label}\n" for label in community_labels)

In [None]:
# Communities ranked 2nd through 5th from each end of the size ordering
# (the 1st smallest / largest are handled separately as `smallest` / `largest`).
second_smallest, third_smallest, fourth_smallest, fifth_smallest = sorted_communities[1:5]
# Negative step walks backwards from the second-to-last entry: indices -2, -3, -4, -5.
second_largest, third_largest, fourth_largest, fifth_largest = sorted_communities[-2:-6:-1]

# Resolve member ids to labels for each of the eight communities.
second_largest_labels = get_labels(second_largest[1])
second_smallest_labels = get_labels(second_smallest[1])

third_largest_labels = get_labels(third_largest[1])
third_smallest_labels = get_labels(third_smallest[1])

fourth_largest_labels = get_labels(fourth_largest[1])
fourth_smallest_labels = get_labels(fourth_smallest[1])

fifth_largest_labels = get_labels(fifth_largest[1])
fifth_smallest_labels = get_labels(fifth_smallest[1])

# (header title, (community id, members), labels) for every output file, in write order.
ranked_outputs = [
    ("Second Largest", second_largest, second_largest_labels),
    ("Second Smallest", second_smallest, second_smallest_labels),
    ("Third Largest", third_largest, third_largest_labels),
    ("Third Smallest", third_smallest, third_smallest_labels),
    ("Fourth Largest", fourth_largest, fourth_largest_labels),
    ("Fourth Smallest", fourth_smallest, fourth_smallest_labels),
    ("Fifth Largest", fifth_largest, fifth_largest_labels),
    ("Fifth Smallest", fifth_smallest, fifth_smallest_labels),
]

# Write one file per community: a '#'-prefixed header line, then one label per line.
for rank_title, community_entry, community_labels in ranked_outputs:
    # "Second Largest" -> "second_largest_community_labels.txt"
    file_name = rank_title.lower().replace(" ", "_") + "_community_labels.txt"
    with open(file_name, "w") as f:
        f.write(
            f"# {rank_title} Community ID: {community_entry[0]}, "
            f"Size: {len(community_entry[1])}\n"
        )
        # Coerce every label to text for all eight files. Previously only the
        # fifth-largest writer used str(); the others would raise TypeError on
        # a non-string label.
        f.writelines(f"{label}\n" for label in community_labels)

In [None]:
# Print sizes of all ten communities
for community in sorted_communities:
    print(f"Community ID: {community[0]}, Size: {len(community[1])}")

Community ID: 19620506, Size: 1
Community ID: 25755457, Size: 2
Community ID: 27117702, Size: 2
Community ID: 32144289, Size: 2
Community ID: 37689005, Size: 2
Community ID: 38677429, Size: 2
Community ID: 43076577, Size: 2
Community ID: 43206360, Size: 2
Community ID: 46854771, Size: 2
Community ID: 48301763, Size: 2
Community ID: 39909968, Size: 3
Community ID: 979261, Size: 4
Community ID: 34164320, Size: 6
Community ID: 1006537, Size: 24
Community ID: 37775976, Size: 41
Community ID: 23969897, Size: 89
Community ID: 23942418, Size: 99
Community ID: 23942752, Size: 103
Community ID: 18150503, Size: 104
Community ID: 2679637, Size: 115
Community ID: 23935789, Size: 149
Community ID: 23939286, Size: 271
Community ID: 23589486, Size: 293
Community ID: 16435, Size: 3303
Community ID: 9493, Size: 4319
Community ID: 1413, Size: 25328
Community ID: 806, Size: 29362
Community ID: 1256, Size: 35781
Community ID: 655, Size: 36275
Community ID: 572, Size: 38506
Community ID: 790, Size: 43584
C

In [None]:
# Read both label files back in full (header line included) as single strings.
with open("smallest_community_labels.txt") as smallest_file:
    smallest_text = smallest_file.read()

with open("largest_community_labels.txt") as largest_file:
    largest_text = largest_file.read()

In [1]:
# How many data lines to read from the label file, and how many of the most
# frequent labels to keep.
MAX_LABELS = 10000
TOP_N = 50

with open("largest_community_labels.txt") as f:
    # Skip '#' header lines and strip each remaining line. islice stops
    # consuming the file after MAX_LABELS data lines, instead of reading and
    # stripping every line only to discard all but the first MAX_LABELS.
    data_lines = (line.strip() for line in f if not line.startswith("#"))
    labels = list(islice(data_lines, MAX_LABELS))

# most_common orders by descending count; equal counts keep first-seen order.
top_labels = Counter(labels).most_common(TOP_N)

# Persist only the labels; the counts are not written.
with open("top_largest_labels.txt", "w") as f:
    f.writelines(f"{label}\n" for label, _ in top_labels)

In [18]:
def reservoir_sample(file_path, k, seed=None):
    """Draw up to ``k`` lines uniformly at random from a text file in one pass.

    Lines that start with ``#`` are skipped. Every other line is stripped of
    surrounding whitespace and counts as one item (blank lines included).
    Only ``k`` items are held in memory at a time.

    Parameters
    ----------
    file_path : str or path-like
        Text file to sample from.
    k : int
        Maximum number of items to return.
    seed : optional
        When given, a private ``random.Random(seed)`` drives the sampling so
        the result is reproducible. When ``None`` (the default) the shared
        module-level generator is used.

    Returns
    -------
    list of str
        All data lines in file order if the file has at most ``k`` of them;
        otherwise ``k`` sampled lines.
    """
    rng = random if seed is None else random.Random(seed)
    reservoir = []
    seen = 0  # number of data lines processed before the current one

    with open(file_path) as f:
        for line in f:
            if line.startswith("#"):
                continue
            item = line.strip()
            if seen < k:
                # Fill phase: the first k items go straight in.
                reservoir.append(item)
            else:
                # randint is inclusive at both ends, so slot is uniform over
                # the seen + 1 items read so far (including this one); the
                # item is kept with probability k / (seen + 1).
                slot = rng.randint(0, seen)
                if slot < k:
                    reservoir[slot] = item
            seen += 1  # incremented only for data lines

    return reservoir

# Sample at most 100 labels from the fifth-largest community's label file.
sampled_labels = reservoir_sample(
    file_path="fifth_largest_community_labels.txt",
    k=100,
)

In [19]:
# Display the sampled labels as the cell output.
sampled_labels

['Recycled Orchestra of Cateura',
 'The Last Song (The All-American Rejects song)',
 'Die Miserable',
 'End the Silence',
 'To Sir, with Love (album)',
 'Markus Klinko',
 'Prinze (surname)',
 'A Sunny Day In Glasgow',
 'Red-Eyed Soul',
 'Million Stylez',
 'Dave Halili',
 'S.C.C. Presents Murder Squad Nationwide',
 'Morningtown Ride',
 'Come What May (2001 song)',
 'Every Little Thing (album)',
 'Open (Gotthard album)',
 'Wayne Krantz',
 "Summer of '81",
 'The Unblinking Eye (Everything Is Broken)',
 "Hailie's Song",
 'Cat In the Hat (1987 album)',
 'Auf der Flucht',
 'Bob Guccione, Jr.',
 'Washington Ballet',
 '100 Broken Windows',
 'HEARTSREVOLUTION',
 'FabricLive.68',
 'Soul Sista (song)',
 'Patricia Van Ness',
 'Broken Bones and Bloody Kisses',
 'Park (band)',
 'I Will Always Love You (Whitney Houston version)',
 'Bright Grey',
 'Get Down (Nas song)',
 'MegaForce',
 'Chic, Chic',
 'Backlash (Freddie Hubbard album)',
 'Simon (Soliko) Virsaladze',
 'Western Idaho Fair',
 'Christian me

In [20]:
# Save the sampled labels, one per line.
with open("fifth_largest_labels.txt", "w") as f:
    f.writelines(label + "\n" for label in sampled_labels)