In [1]:
!pip install torch_geometric



In [2]:
import community as community_louvain  # Louvain method
from networkx.algorithms.community import label_propagation_communities
import networkx as nx
import pickle
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv
import torch
import matplotlib.pyplot as plt


In [4]:
# Load the embeddings for male and female essays
with open("../embedders/embeddings/embeddings_male.obj", "rb") as f:
    embeddings_male = pickle.load(f)

with open("../embedders/embeddings/embeddings_female.obj", "rb") as f:
    embeddings_female = pickle.load(f)

In [5]:
# Function to calculate cosine similarity for two embeddings, returns 0 if either is None
def safe_cosine_similarity(embedding1, embedding2):
    if np.isnan(embedding1[0][0]) or np.isnan(embedding2[0][0]):
        return -0.01
    return cosine_similarity(embedding1, embedding2)[0][0]


In [8]:
from tqdm import tqdm
def build_graph(female_indices, male_indices):
    G = nx.Graph()

    # Add female nodes to the graph
    for female_idx in female_indices:
        G.add_node(female_idx, bipartite=0, type='female')

    # Add male nodes to the graph
    for male_idx in male_indices:
        G.add_node(male_idx, bipartite=1, type='male')

    # Add edges based on essay similarities
    for female_idx in tqdm(female_indices):
        for male_idx in male_indices:
            total_similarity = 0
            valid_essays = 0
            for j in range(10):  # Iterate over all essays (0 to 9)
                essay = f"essay{j}"
                female_embedding = embeddings_female[essay][female_idx].reshape(1, -1)
                male_embedding = embeddings_male[essay][male_idx].reshape(1, -1)
                similarity = safe_cosine_similarity(female_embedding, male_embedding)
                if similarity > 0:
                    total_similarity += similarity
                    valid_essays += 1

            edge_weight = total_similarity / valid_essays if valid_essays > 0 else 0

            # Add edge with weight
            if edge_weight > 0:
                G.add_edge(female_idx, male_idx, weight=edge_weight)

    return G

In [11]:
N = 150
female_indices = list(range(N))
male_indices = list(range(N))

# Build the graph
G = build_graph(female_indices, male_indices)

# Convert NetworkX graph to PyTorch Geometric Data
def nx_to_pyg_data(G):
    # Get edge list and weights
    edge_index = torch.tensor(list(G.edges), dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor([G[u][v]['weight'] for u, v in G.edges], dtype=torch.float)
    # Create feature matrix (one-hot encoding for node types)
    node_features = []
    for node, data in tqdm(G.nodes(data=True)):
        if data['type'] == 'female':
            node_features.append([1, 0])
        else:
            node_features.append([0, 1])
    x = torch.tensor(node_features, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_weight)

# Convert graph to PyTorch Geometric format
data = nx_to_pyg_data(G)

100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:58<00:00,  2.56it/s]
100%|████████████████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 147030.99it/s]


In [14]:
# Clustering and evaluation
kmeans = KMeans(n_clusters=12)

# GCN clustering
kmeans.fit(gcn_embeddings)
gcn_labels = kmeans.labels_
silhouette_gcn = silhouette_score(gcn_embeddings, gcn_labels)

# GraphSAGE clustering
kmeans.fit(graphsage_embeddings)
graphsage_labels = kmeans.labels_
silhouette_graphsage = silhouette_score(graphsage_embeddings, graphsage_labels)

# --- Louvain Method ---
partition = community_louvain.best_partition(G, weight='weight')
louvain_communities = {node: comm for node, comm in partition.items()}
modularity_louvain = nx.algorithms.community.modularity(G, [set(np.where(np.array(list(partition.values())) == i)[0]) for i in set(partition.values())])

# --- Label Propagation ---
label_prop_communities = list(label_propagation_communities(G))
label_prop_dict = {node: idx for idx, community in enumerate(label_prop_communities) for node in community}
modularity_label_prop = nx.algorithms.community.modularity(G, label_prop_communities)

# --- Modularity for GCN and GraphSAGE ---
# Communities for GCN
gcn_communities = {i: gcn_labels[i] for i in range(len(gcn_labels))}
modularity_gcn = nx.algorithms.community.modularity(G, [set(np.where(np.array(gcn_labels) == i)[0]) for i in set(gcn_labels)])

# Communities for GraphSAGE
graphsage_communities = {i: graphsage_labels[i] for i in range(len(graphsage_labels))}
modularity_graphsage = nx.algorithms.community.modularity(G, [set(np.where(np.array(graphsage_labels) == i)[0]) for i in set(graphsage_labels)])

# --- Print Results ---
print(f"Modularity (Louvain): {modularity_louvain:.4f}")
print(f"Modularity (Label Propagation): {modularity_label_prop:.4f}")
print(f"Modularity (GCN): {modularity_gcn:.4f}")
print(f"Modularity (GraphSAGE): {modularity_graphsage:.4f}")
print(f"Silhouette Score (GCN): {silhouette_gcn:.4f}")
print(f"Silhouette Score (GraphSAGE): {silhouette_graphsage:.4f}")

# --- Return Results ---
results = {
    'Louvain': louvain_communities,
    'Label Propagation': label_prop_dict,
    'GCN': gcn_communities,
    'GraphSAGE': graphsage_communities
}
print("\nGraph Partition Results:")
for method, clusters in results.items():
    print(f"{method}: {clusters}")

AttributeError: module 'community' has no attribute 'best_partition'