In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
import torch

from model import *

In [3]:
sns.set(style='darkgrid')
color = plt.rcParams['axes.prop_cycle'].by_key()['color']
np.random.seed(42)

# Load Data

In [4]:
# Load mentee survey data
survey_results = pd.read_csv('./data/survey_results.csv')
questions = survey_results.iloc[0]
survey_results = survey_results.iloc[2:]
id_questions = ['Q1', 'Q2']
numeric_questions = ['Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q17_1', 'Q17_2', 'Q17_3', 'Q17_4', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6', 'Q18_1', 'Q19_1', 'Q19_2', 'Q19_3', 'Q19_4', 'Q19_5', 'Q19_6', 'Q20_1', 'Q20_2', 'Q20_3', 'Q20_4', 'Q21_1', 'Q21_2', 'Q21_3', 'Q21_4', 'Q21_5', 'Q22_1', 'Q22_2', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28_1', 'Q29', 'Q30', 'Q31', 'Q32']
# Eliminate empty names
for col in id_questions:
    survey_results = survey_results[survey_results[col].notna()]
# Clean strings
def clean(x):
    if pd.isnull(x):
        return x
    else:
        return int(''.join(s for s in x if s.isdigit()))
survey_results[numeric_questions] = survey_results[numeric_questions].applymap(clean)
# Fill missing results
knn = KNNImputer(n_neighbors=5)
survey_results[numeric_questions] = knn.fit_transform(survey_results[numeric_questions])
survey_results[numeric_questions] = survey_results[numeric_questions].astype(int)
# Standardize
survey_results[numeric_questions] = preprocessing.scale(survey_results[numeric_questions], axis=0)
# Finalize
survey_ids = np.array(survey_results[id_questions].applymap(lambda x: x.lower()))
survey_ids = np.array([' '.join(x) for x in survey_ids])
survey_ids = survey_ids.flatten()
survey_results = np.array(survey_results[numeric_questions])
# Remove duplicates by ids (might want to take last occurrence)
_, first_occurrences = np.unique(survey_ids, return_index=True)
mentee_survey_ids, mentee_survey_results = survey_ids[first_occurrences], survey_results[first_occurrences]

# Load mentor survey data
survey_results = pd.read_csv('./data/survey_results_mentors.csv')
questions = survey_results.iloc[0]
survey_results = survey_results.iloc[2:]
id_questions = ['email address']
# categorical_questions = ['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q31', 'Q32', 'Q33']
numeric_questions = ['Q3_1', 'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q16_1', 'Q16_2', 'Q16_3', 'Q16_4', 'Q17_2', 'Q17_3', 'Q17_4', 'Q17_5', 'Q17_6', 'Q17_1', 'Q18_1', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6', 'Q19_1', 'Q19_2', 'Q19_3', 'Q19_4', 'Q20_1', 'Q20_2', 'Q20_3', 'Q20_4', 'Q20_5', 'Q21_1', 'Q21_2', 'Q22', 'Q23', 'Q24', 'Q25']
# Eliminate empty names
for col in id_questions:
    survey_results = survey_results[survey_results[col].notna()]
# Clean strings
def clean(x):
    if pd.isnull(x):
        return x
    else:
        return int(''.join(s for s in x if s.isdigit()))
survey_results[numeric_questions] = survey_results[numeric_questions].applymap(clean)
# Fill missing results
knn = KNNImputer(n_neighbors=5)
survey_results[numeric_questions] = knn.fit_transform(survey_results[numeric_questions])
survey_results[numeric_questions] = survey_results[numeric_questions].astype(int)
# Standardize
survey_results[numeric_questions] = preprocessing.scale(survey_results[numeric_questions], axis=0)
# Finalize
survey_ids = np.array(survey_results[id_questions].applymap(lambda x: x.lower()))
survey_ids = np.array([' '.join(x) for x in survey_ids])
survey_ids = survey_ids.flatten()
survey_results = np.array(survey_results[numeric_questions])
# Remove duplicates by ids (might want to take last occurrence)
_, first_occurrences = np.unique(survey_ids, return_index=True)
mentor_survey_ids, mentor_survey_results = survey_ids[first_occurrences], survey_results[first_occurrences]

# Load match data
match_results = pd.read_csv('./data/matches.csv')
mentor_cols = [' Mentor Email']
mentee_cols = ['Mentee First Name', 'Mentee Last Name']
match_results_mentor = np.array(match_results[mentor_cols].applymap(lambda x: x.lower()))
match_results_mentor = np.array([' '.join(x) for x in match_results_mentor])
match_results_mentee = np.array(match_results[mentee_cols].applymap(lambda x: x.lower()))
match_results_mentee = np.array([' '.join(x) for x in match_results_mentee])
match_results = np.stack([match_results_mentee, match_results_mentor], axis=1)

# Take and filter to intersection of mentees
mentee_intersection = np.intersect1d(match_results[:, 0], mentee_survey_ids)
survey_idx = [n in mentee_intersection for n in mentee_survey_ids]
mentee_survey_ids = mentee_survey_ids[survey_idx]
mentee_survey_results = mentee_survey_results[survey_idx]
match_idx = [n in mentee_intersection for n in match_results[:, 0]]
match_results = match_results[match_idx, :]

# Take and filter to intersection of mentors
mentor_intersection = np.intersect1d(match_results[:, 1], mentor_survey_ids)
survey_idx = [n in mentor_intersection for n in mentor_survey_ids]
mentor_survey_ids = mentor_survey_ids[survey_idx]
mentor_survey_results = mentor_survey_results[survey_idx]
match_idx = [n in mentor_intersection for n in match_results[:, 1]]
match_results = match_results[match_idx, :]

# Take and filter to intersection of mentees (again, since mentor filtering could exclude a few)
mentee_intersection = np.intersect1d(match_results[:, 0], mentee_survey_ids)
survey_idx = [n in mentee_intersection for n in mentee_survey_ids]
mentee_survey_ids = mentee_survey_ids[survey_idx]
mentee_survey_results = mentee_survey_results[survey_idx]

# Unique mentors
mentors, match_results[:, 1] = np.unique(match_results[:, 1], return_inverse=True)
mentor_survey_ids = np.array([np.argwhere(n == mentors)[0] for n in mentor_survey_ids]).flatten()
# Unique mentees
mentees, match_results[:, 0] = np.unique(match_results[:, 0], return_inverse=True)
mentee_survey_ids = np.array([np.argwhere(n == mentees)[0] for n in mentee_survey_ids]).flatten()

# Formulate vars
matches = match_results.astype(int)
matches_outcome = np.ones(len(matches)).astype(float)  # Presence data
mentee_features = mentee_survey_results.astype(float)
mentee_features_ids = mentee_survey_ids
mentor_features = mentor_survey_results.astype(float)
mentor_features_ids = mentor_survey_ids
# mentees
# mentors

In [5]:
# Train-Test
train_frac = .8
mentees_train_idx = range(int(train_frac * len(mentees)))
mentees_test_idx = list(set(range(len(mentees))) - set(mentees_train_idx))
mentors_train_idx = range(int(train_frac * len(mentors)))
mentors_test_idx = list(set(range(len(mentors))) - set(mentors_train_idx))

# Reserve full
mentee_features_full = mentee_features
mentor_features_full = mentor_features
mentee_features_ids_full = mentee_features_ids
mentor_features_ids_full = mentor_features_ids
matches_full = matches
matches_outcome_full = matches_outcome

sort_idx = np.argsort(mentee_features_ids)  # TODO, Further shuffle matches
mentee_features, mentee_features_test = mentee_features[sort_idx][mentees_train_idx], mentee_features[sort_idx][mentees_test_idx]
mentee_features_ids, mentee_features_ids_test = mentee_features_ids[sort_idx][mentees_train_idx], mentee_features_ids[sort_idx][mentees_test_idx]

sort_idx = np.argsort(mentor_features_ids)  # TODO, Further shuffle matches
mentor_features, mentor_features_test = mentor_features[sort_idx][mentors_train_idx], mentor_features[sort_idx][mentors_test_idx]
mentor_features_ids, mentor_features_ids_test = mentor_features_ids[sort_idx][mentors_train_idx], mentor_features_ids[sort_idx][mentors_test_idx]

idx = [(mentee in mentee_features_ids and mentor in mentor_features_ids) for mentee, mentor in matches]
tidx = [(mentee in mentee_features_ids_test and mentor in mentor_features_ids_test) for mentee, mentor in matches]
midx = list(set(range(len(matches))) - set(idx) - set(tidx))
matches, matches_test, matches_mixed = matches[idx], matches[tidx], matches[midx]
matches_outcome, matches_outcome_test, matches_outcome_mixed = matches_outcome[idx], matches_outcome[tidx], matches_outcome[midx]
# Finalize dim variables
num_mentors = len(mentors_train_idx)
num_mentees = len(mentees_train_idx)
num_matches = len(matches)
num_mentees_features = mentee_features.shape[1]
num_mentors_features = mentor_features.shape[1]
num_mentors_test = len(mentors_test_idx)
num_mentees_test = len(mentors_test_idx)
num_matches_test = len(matches_test)

# Cast to type
matches = torch.Tensor(matches.astype(float)).int()
matches_test = torch.Tensor(matches_test.astype(float)).int()
matches_mixed = torch.Tensor(matches_mixed.astype(float)).int()
matches_outcome = torch.Tensor(matches_outcome).float()
matches_outcome_test = torch.Tensor(matches_outcome_test).float()
matches_outcome_mixed = torch.Tensor(matches_outcome_mixed).float()
mentee_features = torch.Tensor(mentee_features).float()
mentee_features_test = torch.Tensor(mentee_features_test).float()
mentee_features_full = torch.Tensor(mentee_features_full).float()
mentor_features = torch.Tensor(mentor_features).float()
mentor_features_test = torch.Tensor(mentor_features_test).float()
mentor_features_full = torch.Tensor(mentor_features_full).float()

# Common Latent Space Autoencoder KNN Model

In [6]:
# User Variables
dim_embedding = 7
epochs = 4001
batches = 20
lr = .0003
epoch_pd = 1000
batch_size = 32
batches = int(len(matches)/batch_size)

# Autoencoder
mentee_model = LatentModel(num_mentees_features, latent_dim=dim_embedding)
mentee_optimizer = torch.optim.AdamW(mentee_model.parameters(), lr=lr)
mentor_model = LatentModel(num_mentors_features, latent_dim=dim_embedding)
mentor_optimizer = torch.optim.AdamW(mentor_model.parameters(), lr=lr)
criterion = nn.MSELoss()

In [7]:
mentee_model.train()
mentor_model.train()
for epoch in range(epochs):
    for _ in range(batches):
        mentee_optimizer.zero_grad()
        mentor_optimizer.zero_grad()
        idx = np.random.choice(range(num_matches), batch_size, replace=False)
        input_matches = matches[idx]
        input_mentee = mentee_features[input_matches[:, 0].numpy(), :]
        input_mentor = mentor_features[input_matches[:, 1].numpy(), :]
        
        mentee_lat, mentee_reconst = mentee_model(input_mentee)
        mentee_loss = criterion(mentee_reconst, input_mentee)
        mentor_lat, mentor_reconst = mentor_model(input_mentor)
        mentor_loss = criterion(mentor_reconst, input_mentor)
        
        match_loss = .5 - .5 * sum(
            torch.sum(mentee_lat * mentor_lat, dim=1)
            / (torch.norm(mentee_lat, dim=1) * torch.norm(mentor_lat, dim=1))) / batch_size
        
        mentor_loss.backward(retain_graph=True)
        mentee_loss.backward(retain_graph=True)
        match_loss.backward()
        mentee_optimizer.step()
        mentor_optimizer.step()
        
    if epoch % epoch_pd == 0:
        print(f'Epoch: {epoch}', end=' \t')
        print(f'Mentee Loss: {float(mentee_loss)}', end=' \t')
        print(f'Mentor Loss: {float(mentor_loss)}', end=' \t')
        print(f'Match Loss: {float(match_loss)}')
mentee_model.eval()
mentor_model.eval();

Epoch: 0 	Mentee Loss: 1.9529311656951904 	Mentor Loss: 1.9996740818023682 	Match Loss: 0.5035293102264404
Epoch: 1000 	Mentee Loss: 0.10687097907066345 	Mentor Loss: 0.07742451876401901 	Match Loss: 0.03462797403335571
Epoch: 2000 	Mentee Loss: 0.08870913088321686 	Mentor Loss: 0.043789055198431015 	Match Loss: 0.022661268711090088
Epoch: 3000 	Mentee Loss: 0.05912439525127411 	Mentor Loss: 0.05966498330235481 	Match Loss: 0.013505339622497559
Epoch: 4000 	Mentee Loss: 0.0391707606613636 	Mentor Loss: 0.041679829359054565 	Match Loss: 0.007522225379943848


## Evaluation

In [12]:
k_val_frac = .1
# Train
k_val = int(k_val_frac * len(mentor_features_ids))
KNN = KNeighborsClassifier(n_neighbors=k_val)
feat = mentee_model(mentee_features)[0].detach().cpu().numpy()
feat_or = mentor_model(mentor_features)[0].detach().cpu().numpy()
labels = mentor_features_ids
KNN.fit(feat_or, labels)
# Predict
predicted = KNN.kneighbors(feat[matches[:, 0]])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)
actual = matches[:, 1]
performance = sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual)
print(f'Train\nTop {k_val} Accuracy: {performance:.2f}\nBaseline: {k_val / len(labels):.2f}\n')

# Mixed
k_val = int(k_val_frac * len(mentor_features_ids_full))
KNN = KNeighborsClassifier(n_neighbors=k_val)
feat = mentee_model(mentee_features_full)[0].detach().cpu().numpy()
feat_or = mentor_model(mentor_features_full)[0].detach().cpu().numpy()
labels = mentor_features_ids_full
KNN.fit(feat_or, labels)
# Predict
predicted = KNN.kneighbors(feat[matches_mixed[:, 0]])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)
actual = matches_mixed[:, 1]
performance = sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual)
print(f'Mixed\nTop {k_val} Accuracy: {performance:.2f}\nBaseline: {k_val / len(labels):.2f}\n')

# Full Mentors
k_val = int(k_val_frac * len(mentor_features_ids_full))
KNN = KNeighborsClassifier(n_neighbors=k_val)
feat = mentee_model(mentee_features)[0].detach().cpu().numpy()
feat_or = mentor_model(mentor_features_full)[0].detach().cpu().numpy()
labels = mentor_features_ids_full
KNN.fit(feat_or, labels)
# Predict
mat = matches_mixed[:, 0]
mat = mat[[(int(x) in mentee_features_ids) for x in mat]]
predicted = KNN.kneighbors(feat[mat])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)
actual = matches_mixed[:, 1]
performance = sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual)
print(f'Mixed Mentors\nTop {k_val} Accuracy: {performance:.2f}\nBaseline: {k_val / len(labels):.2f}\n')

# Full Mentees
k_val = int(k_val_frac * len(mentor_features_ids))
KNN = KNeighborsClassifier(n_neighbors=k_val)
feat = mentee_model(mentee_features_full)[0].detach().cpu().numpy()
feat_or = mentor_model(mentor_features)[0].detach().cpu().numpy()
labels = mentor_features_ids
KNN.fit(feat_or, labels)
# Predict
predicted = KNN.kneighbors(feat[matches_mixed[:, 0]])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)
actual = matches_mixed[:, 1]
performance = sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual)
print(f'Mixed Mentees\nTop {k_val} Accuracy: {performance:.2f}\nBaseline: {k_val / len(labels):.2f}\n')

# Test
k_val = int(k_val_frac * len(mentor_features_ids_test))
KNN = KNeighborsClassifier(n_neighbors=k_val)
feat = mentee_model(mentee_features_test)[0].detach().cpu().numpy()
feat_or = mentor_model(mentor_features_test)[0].detach().cpu().numpy()
labels = mentor_features_ids_test
KNN.fit(feat_or, labels)
# Predict
predicted = KNN.kneighbors(feat[matches_test[:, 0] - len(mentees_train_idx)])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)
actual = matches_test[:, 1]
performance = sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual)
print(f'Test\nTop {k_val} Accuracy: {performance:.2f}\nBaseline: {k_val / len(labels):.2f}\n')

Train
Top 5 Accuracy: 0.98
Baseline: 0.10

Mixed
Top 6 Accuracy: 0.67
Baseline: 0.09

Mixed Mentors
Top 6 Accuracy: 0.10
Baseline: 0.09

Mixed Mentees
Top 5 Accuracy: 0.66
Baseline: 0.10

Test
Top 1 Accuracy: 0.00
Baseline: 0.07

