In [1]:
import pickle

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the male graph from a weighted edge list. Node labels come back as
# strings (see the NodeView output below), which is why later cells cast
# vertex ids to str before indexing into G.
G = nx.read_weighted_edgelist('male_graph.edgelist')
G.nodes

NodeView(('1', '2', '3', '0'))

In [3]:
# Cluster assignment table; later cells read its "vertex_id" and
# "cluster_id" columns.
clusters = pd.read_csv("male_clusters.csv")

In [4]:
# Show the loaded table to sanity-check its columns and values.
clusters

Unnamed: 0,vertex_id,cluster_id
0,0,1
1,1,1
2,2,2
3,3,2


In [5]:
def compute_cluster_centroid(cluster: int, random_state=0):
    """Return the vertex of ``cluster`` that lies closest to the cluster centroid.

    The pairwise edge weights between the cluster's vertices in ``G`` are used
    as a precomputed dissimilarity matrix, embedded with MDS, and the vertex
    whose embedded position is nearest to the mean position is returned.

    NOTE(review): this treats edge weights as distances (larger = further
    apart) -- confirm that matches how the edge list was produced.

    Parameters
    ----------
    cluster : int
        Value of the "cluster_id" column selecting the vertices to consider.
    random_state : int, RandomState instance or None, default 0
        Seed forwarded to ``MDS`` so the embedding (and therefore the chosen
        vertex) is reproducible across runs. Pass ``None`` for an unseeded run.

    Returns
    -------
    str
        The vertex id (as a string, matching the node labels of ``G``).

    Raises
    ------
    ValueError
        If no row of ``clusters`` belongs to ``cluster``.
    KeyError
        If two vertices of the cluster are not joined by an edge in ``G``.
    """
    # Node labels in G are strings, so the ids are cast before graph lookups.
    in_cluster = clusters["cluster_id"] == cluster
    vertices = clusters.loc[in_cluster, "vertex_id"].astype(str).values
    n_vertices = len(vertices)

    if n_vertices == 0:
        raise ValueError(f"cluster {cluster!r} has no vertices")
    if n_vertices == 1:
        # A single vertex is trivially its own centroid; no embedding needed.
        return vertices[0]

    # np.zeros (not np.ndarray) so the never-assigned diagonal is a true 0
    # self-distance instead of uninitialised memory.
    distance_matrix = np.zeros((n_vertices, n_vertices))
    for i in range(n_vertices):
        # Only walk the upper triangle and mirror it: one edge lookup per pair.
        for j in range(i + 1, n_vertices):
            weight = G[vertices[i]][vertices[j]]["weight"]
            distance_matrix[i, j] = weight
            distance_matrix[j, i] = weight

    # Use MDS to get the positions of the vertices in an embedding space.
    mds = MDS(
        dissimilarity="precomputed",
        n_components=n_vertices,
        random_state=random_state,
    )
    positions = mds.fit_transform(distance_matrix)

    # The centroid is the mean embedded position of all vertices.
    centroid = np.mean(positions, axis=0)

    # Euclidean distance from each vertex to the centroid; the closest wins.
    distances = np.linalg.norm(positions - centroid, axis=1)
    centroid_index = np.argmin(distances)
    return vertices[centroid_index]

In [6]:
# Distinct cluster ids, in order of first appearance in the table.
clusters_list = clusters["cluster_id"].drop_duplicates().tolist()
clusters_list

[1, 2]

In [7]:
# Map every cluster id to its representative (centroid) vertex ...
cluster_to_centroid: dict = dict(
    zip(clusters_list, map(compute_cluster_centroid, clusters_list))
)
# ... and build the reverse lookup from centroid vertex back to cluster id.
centroid_to_cluster: dict = dict(
    zip(cluster_to_centroid.values(), cluster_to_centroid.keys())
)
cluster_to_centroid

{1: '0', 2: '2'}

In [8]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only load a features file produced by this project's own pipeline, never
# one obtained from an untrusted source.
with open("../data/features", "rb") as f:
    encoded_features = pickle.load(f)

# Preprocessed dataset stored alongside the encoded features.
data = pd.read_csv("../data/preprocessed_data.csv")

  encoded_features = pickle.load(f)


In [9]:
# Feature vector of each cluster's centroid vertex, in the same order as
# cluster_to_centroid; centroid ids are strings, so convert them to ints
# before indexing into encoded_features.
centroids_vec = [
    encoded_features[vertex_index]
    for vertex_index in map(int, cluster_to_centroid.values())
]

In [40]:
def match_female_with_males(female_id: int):
    """Return the male vertex ids of the cluster best matching a female.

    The female's encoded feature vector is compared against every cluster
    centroid vector with cosine similarity, and the members of the most
    similar cluster are returned.

    Parameters
    ----------
    female_id : int
        Index of the female in ``encoded_features``.

    Returns
    -------
    numpy.ndarray
        Vertex ids (as strings) of all rows in ``clusters`` that belong to the
        selected cluster.

    Raises
    ------
    ValueError
        If ``centroids_vec`` is empty (there is no cluster to choose from).
    """
    female_vec = encoded_features[female_id].reshape(1, -1)
    similarities = [
        cosine_similarity(female_vec, centroid_vec.reshape(1, -1))[0][0]
        for centroid_vec in centroids_vec
    ]

    # cosine_similarity grows with similarity, so the best match is the
    # maximum; taking the minimum would select the least similar cluster.
    best_index = int(np.argmax(similarities))

    # centroids_vec is built from cluster_to_centroid.values(), so position k
    # corresponds to the k-th key. Looking the id up avoids assuming that
    # cluster ids are exactly 1..k.
    closest_cluster = list(cluster_to_centroid)[best_index]
    return clusters[clusters["cluster_id"] == closest_cluster]["vertex_id"].astype(str).values