In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import networkx as nx

In [2]:
data = pd.read_csv("../data/preprocessed_data.csv")

In [3]:
male_data = data[data["sex"] == "m"]
female_data = data[data["sex"] == "f"]

In [4]:
# Clustering males and females
def cluster_users(males_features, females_features, num_clusters=10):
    """
    Clusters both males and females separately and returns the cluster labels for each.
    """
    # Clustering the males
    males_kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    males_clusters = males_kmeans.fit_predict(males_features)
    
    # Clustering the females
    females_kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    females_clusters = females_kmeans.fit_predict(females_features)
    
    return males_clusters, females_clusters, males_kmeans, females_kmeans

# Function to find the N nearest males for a given female user
def find_nearest_males(female_idx, females_features, males_features, females_clusters, males_clusters, N=100):
    """
    Given the index of a female, returns the N nearest males from the same cluster.
    """
    # Get the cluster of the given female
    female_cluster = females_clusters[female_idx]
    
    # Find all males in the same cluster
    males_in_same_cluster = np.where(males_clusters == female_cluster)[0]
    
    # Get the feature vector of the female
    female_features = females_features[female_idx].reshape(1, -1)
    
    # Calculate the cosine similarity between the female and each male in the same cluster
    similarities = []
    for male_idx in males_in_same_cluster:
        male_features = males_features[male_idx].reshape(1, -1)
        similarity = cosine_similarity(female_features, male_features)[0][0]
        similarities.append((male_idx, similarity))
    
    # Sort by similarity and return the top N males
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_n_males = [male_idx for male_idx, _ in similarities[:N]]
    
    return top_n_males

In [5]:
with open("../data/females_features", "rb") as f:
    females_features = pickle.load(f)

with open("../data/males_features", "rb") as f:
    males_features = pickle.load(f)

females_features.shape, males_features.shape    

((20533, 261), (31073, 261))

In [6]:
N: int = 500

# Cluster males and females separately
males_clusters, females_clusters, males_kmeans, females_kmeans = cluster_users(males_features, females_features, num_clusters=10)

# Let's say you want to find the top 5 nearest males for a female at index 10
female_idx = 238
top_N_males = find_nearest_males(female_idx, females_features, males_features, females_clusters, males_clusters, N=N)

### Let's have a brief glance at the result

In [7]:
female_data.iloc[female_idx]

age                                                           37
status                                                    single
sex                                                            f
orientation                                             straight
body_type                                              endomorph
drinks                                                  socially
education                                        College or more
height                                                     173.0
job                                            medicine / health
location                                     oakland, california
religion                                             agnosticism
smokes                                                        no
essay0         i am a leader, reader and eater. i'm the glue ...
essay1         volunteering, i recruit people to do it for wo...
essay2         planning a party and executing it. i know more...
essay3         my laugh, 

In [8]:
male_data.iloc[top_N_males]

Unnamed: 0,age,status,sex,orientation,body_type,drinks,education,height,job,location,...,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,likes_dogs,likes_cats
15067,50,single,m,straight,endomorph,socially,College or more,183.0,other,"oakland, california",...,"showering attention, making people laugh, play...",my sense of humor and ability to put people at...,"food: sushi music: jazz fusion, especially ret...","my friends, my cat, my computer, my drumkit/mu...",nothing in particular. i don't obsess over any...,rehearsing with the band in our studio in west...,i derive great satisfaction in satisfying and ...,"you want to share some time with a funny, inte...",Yes,Yes
47190,49,single,m,straight,endomorph,socially,Some college,183.0,political / government,"oakland, california",...,this site is like filling an application.askin...,tall.if you have pics on this site people usua...,all the above except books its not me.maybe la...,life.. loving some one forever i mean really b...,,working but ready for saturday.ready for anyth...,,,Yes,Yes
37770,54,single,m,straight,mesomorph,socially,College or more,185.0,artistic / musical / writer,"oakland, california",...,i'm very adept at creating art and music of al...,i'm tall and maybe sometimes too forward as fa...,too many answers to this question so i'll just...,food music sex trees coffee water,different ways to reinvent what people come to...,don't usually even know when it is friday night.,obviously it's private.,you are into solving crimes and developing ope...,Yes,Yes
47808,50,single,m,straight,mesomorph,socially,College or more,183.0,science / tech / engineering,"oakland, california",...,"computational linguistics, deconstructing shor...",,"i read novels mostly, but there's the occasion...",swimming you reading fat boy fountain pen with...,verisimilitude. semiotics.,writing.,"if you ask, i'll admit it.","you're smart, and sexy, and you read, and you ...",Yes,Yes
47654,54,single,m,straight,mesomorph,socially,College or more,188.0,medicine / health,"san bruno, california",...,giving massage! listening - and being inquisit...,my blue eyes or my smile. i'm a tall drink of ...,"books - the stand, the prophet, chronicles of ...","love, family, intuition, empathy, touch, and l...","my son and his development, my interactions wi...","sometimes tired after a hectic week, or wantin...",i need a very patient dance instructor,if you are looking for a sensitive man who is ...,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30166,58,single,m,straight,endomorph,not at all,Some college,178.0,other,"mountain view, california",...,"staying calm when things get crazy, being pret...",my smile and positive attitude. that i am a go...,"i am not a big reader, but i do like watching ...","honestly, there are not many things that i cou...",starting a new enterprise. how i can become a ...,doing what i do on friday nights.,will be revealed when we meet.,you would like to know more.,Yes,Yes
18599,54,single,m,straight,mesomorph,socially,High school or less,178.0,transportation,"san leandro, california",...,keeping an open mind to doing new things that ...,my big brown eyes,books just started reading again so when i get...,love romance a soul mate who is my best friend...,is there that one that wants to share there li...,working so its sat night for me,i will save that answer for the one i meet,if you want to talk and get to know each other...,Yes,Yes
15473,54,single,m,straight,mesomorph,socially,College or more,183.0,computer / hardware / software,"palo alto, california",...,putting the seat down. critical thinking. open...,my soulful blue eyes? my rakish good looks? my...,okcupid question: shaved monkey because this i...,"my two grown boys, but they're not things, the...",learning to speak japanese. they have a differ...,"slaying dragons, typically.","when i'm sauteing mushrooms, i like to put a l...",you understand that your my self-summary secti...,Yes,No
22619,54,available,m,straight,mesomorph,socially,College or more,183.0,executive / management,"walnut creek, california",...,stimulating smart fun people :),my eyes and i am told my calves (don't get tha...,"big bang theory, hawaii five 0 for tv. food wi...",i am a techy at heart: computer sex toys gps ...,"how to improve the world and get ahead, capita...",out to dinner...,i am a little kinky,"if you want to :) and you are smart, sexy, dis...",Yes,No


# The matches look great!

#### Now build a graph based on the essays and run some computationally complex algorithm

In [9]:
# Load the essay embeddings from the file
with open("../embedders/embeddings/embeddings_male.obj", "rb") as f:
    embeddings_male = pickle.load(f)

with open("../embedders/embeddings/embeddings_female.obj", "rb") as f:
    embeddings_female = pickle.load(f)

In [10]:
# Function to calculate cosine similarity for two embeddings, returns 0 if either is None
def safe_cosine_similarity(embedding1, embedding2):
    if np.isnan(embedding1[0][0]) or np.isnan(embedding2[0][0]):
        return -0.01
    return cosine_similarity(embedding1, embedding2)[0][0]


# Build the bipartite graph for one female and N closest males
def build_graph(female_idx, closest_males):

    # Initialize the graph
    G = nx.Graph()
    
    # Add female node to the graph
    G.add_node(female_idx, bipartite=0, type='female')
    
    # Add male nodes to the graph and create edges based on essay similarities
    for male_idx in closest_males:
        G.add_node(male_idx, bipartite=1, type='male')
        
        # Calculate the edge weights as the average cosine similarity between essay embeddings
        total_similarity = 0
        valid_essays = 0
        for j in range(10):  # Iterate over all essays (0 to 9)
            essay = f"essay{j}"
            female_embedding = embeddings_female[essay][female_idx].reshape(1, -1)
            male_embedding = embeddings_male[essay][male_idx].reshape(1, -1)
            similarity = safe_cosine_similarity(female_embedding, male_embedding)
            if similarity > 0:
                total_similarity += similarity
                valid_essays += 1
        
        # If we found valid essays, average the similarity; otherwise, set to 0
        edge_weight = total_similarity / valid_essays if valid_essays > 0 else 0
        
        # Add edge between female and male with the computed similarity as weight
        G.add_edge(female_idx, male_idx, weight=edge_weight)
    
    return G

G = build_graph(female_idx=female_idx, closest_males=top_N_males)

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


class RankingDataset(Dataset):
    def __init__(self, female_features, male_features, edge_weights):
        self.female_features = female_features
        self.male_features = male_features
        self.edge_weights = edge_weights

    def __len__(self):
        return len(self.edge_weights)

    def __getitem__(self, idx):
        return {
            'female': torch.tensor(self.female_features[idx], dtype=torch.float32),
            'male': torch.tensor(self.male_features[idx], dtype=torch.float32),
            'weight': torch.tensor(self.edge_weights[idx], dtype=torch.float32)
        }


class RankingNN(nn.Module):
    def __init__(self, female_input_dim, male_input_dim):
        super(RankingNN, self).__init__()
        self.fc1 = nn.Linear(female_input_dim + male_input_dim, 256)
        self.fc2 = nn.Linear(256, 64)
        self.fc3 = nn.Linear(64, 1)  # Output a score for ranking

    def forward(self, female_features, male_features):
        # Concatenate female and male features
        x = torch.cat([female_features, male_features], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Prepare data for the neural network
female_features_all = []  # List of female features
male_features_all = []    # List of male features
edge_weights_all = []     # List of edge weights (similarity scores)

for female_idx, male_idx in G.edges():
    female_features_all.append(females_features[female_idx])
    male_features_all.append(males_features[male_idx])
    edge_weights_all.append(G[female_idx][male_idx]['weight'])


dataset = RankingDataset(female_features_all, male_features_all, edge_weights_all)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, loss function, and optimizer
model = RankingNN(female_input_dim=females_features.shape[1], male_input_dim=males_features.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()

        female_features = batch['female']
        male_features = batch['male']
        edge_weights = batch['weight']

        predictions = model(female_features, male_features)
        loss = criterion(predictions.squeeze(), edge_weights)

        loss.backward()
        optimizer.step()

        if loss.item() <= 0.001:
            break

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

# The model is now trained and you can use it to predict rankings for new females


Epoch 1/50, Loss: 0.0045
Epoch 2/50, Loss: 0.0014
Epoch 3/50, Loss: 0.0056
Epoch 4/50, Loss: 0.0031
Epoch 5/50, Loss: 0.0009
Epoch 6/50, Loss: 0.0041
Epoch 7/50, Loss: 0.0062
Epoch 8/50, Loss: 0.0009
Epoch 9/50, Loss: 0.0026
Epoch 10/50, Loss: 0.0026
Epoch 11/50, Loss: 0.0007
Epoch 12/50, Loss: 0.0009
Epoch 13/50, Loss: 0.0010
Epoch 14/50, Loss: 0.0009
Epoch 15/50, Loss: 0.0010
Epoch 16/50, Loss: 0.0009
Epoch 17/50, Loss: 0.0028
Epoch 18/50, Loss: 0.0009
Epoch 19/50, Loss: 0.0053
Epoch 20/50, Loss: 0.0006
Epoch 21/50, Loss: 0.0009
Epoch 22/50, Loss: 0.0006
Epoch 23/50, Loss: 0.0006
Epoch 24/50, Loss: 0.0008
Epoch 25/50, Loss: 0.0004
Epoch 26/50, Loss: 0.0008
Epoch 27/50, Loss: 0.0004
Epoch 28/50, Loss: 0.0006
Epoch 29/50, Loss: 0.0007
Epoch 30/50, Loss: 0.0005
Epoch 31/50, Loss: 0.0005
Epoch 32/50, Loss: 0.0005
Epoch 33/50, Loss: 0.0007
Epoch 34/50, Loss: 0.0006
Epoch 35/50, Loss: 0.0008
Epoch 36/50, Loss: 0.0005
Epoch 37/50, Loss: 0.0005
Epoch 38/50, Loss: 0.0005
Epoch 39/50, Loss: 0.

## Make predictions!

In [12]:
# Get the model's predictions for the new female
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    predictions = []
    for batch in data_loader:
        female_features = batch['female']
        male_features = batch['male']
        
        # Get predicted rankings (scores)
        predicted_scores = model(female_features, male_features).squeeze().numpy()
        predictions.extend(predicted_scores)

In [13]:
predictions_indexed = [(x, i) for i, x in enumerate(predictions)]
predictions_indexed.sort(key=lambda x: -x[0])

In [14]:
best_match = predictions_indexed[0][1]
print("The best match ID is:", best_match)

The best match ID is: 181


In [15]:
essays_male = [male_data.iloc[best_match][f"essay{i}"] for i in range(10)]
essays_female = [female_data.iloc[female_idx][f"essay{i}"] for i in range(10)]

print("MALE ESSAYS:")
for e in essays_male:
    if e is not np.nan:
        print(e)
print("=================================")
print("FEMALE ESSAYS:")
for e in essays_female:
    if e is not np.nan:
        print(e)

MALE ESSAYS:
i'm very honest & straightforward, both with you & with me, so i tend to know what i'm about and i don't mind telling you how i'm feeling. i like people who are communicative & functional - i don't want to have to guess what's wrong, i want you to tell me, and if you're happy i'd like to hear about it. i'm easy to get along with, probably largely because i'm not very demanding and i am very forgiving - i'm appreciative of the things that work out between us and i understand that sometimes things just don't quite go as planned. so you're late, or you have to cancel something, or you're grumpy in the mornings - so what? i take everyone with a grain of salt, and i don't tend to take anything too seriously.  i like to see people i care about as often as i can, but whether that translates to once a month or twice a week is fine. some people are busy, some aren't - live your life how you want to, and i'll be happy if we can do so together. i might really love spending time with 