In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
import torch

from model import *

In [3]:
sns.set(style='darkgrid')
color = plt.rcParams['axes.prop_cycle'].by_key()['color']
np.random.seed(42)

# Load Data

In [4]:
# Load mentee survey data
survey_results = pd.read_csv('./data/survey_results.csv')
questions = survey_results.iloc[0]
survey_results = survey_results.iloc[2:]
id_questions = ['Q1', 'Q2']
numeric_questions = ['Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q17_1', 'Q17_2', 'Q17_3', 'Q17_4', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6', 'Q18_1', 'Q19_1', 'Q19_2', 'Q19_3', 'Q19_4', 'Q19_5', 'Q19_6', 'Q20_1', 'Q20_2', 'Q20_3', 'Q20_4', 'Q21_1', 'Q21_2', 'Q21_3', 'Q21_4', 'Q21_5', 'Q22_1', 'Q22_2', 'Q23', 'Q24', 'Q25', 'Q26', 'Q27', 'Q28_1', 'Q29', 'Q30', 'Q31', 'Q32']
# Eliminate empty names
for col in id_questions:
    survey_results = survey_results[survey_results[col].notna()]
# Clean strings
def clean(x):
    if pd.isnull(x):
        return x
    else:
        return int(''.join(s for s in x if s.isdigit()))
survey_results[numeric_questions] = survey_results[numeric_questions].applymap(clean)
# Fill missing results
knn = KNNImputer(n_neighbors=5)
survey_results[numeric_questions] = knn.fit_transform(survey_results[numeric_questions])
survey_results[numeric_questions] = survey_results[numeric_questions].astype(int)
# Standardize
survey_results[numeric_questions] = preprocessing.scale(survey_results[numeric_questions], axis=0)
# Finalize
survey_ids = np.array(survey_results[id_questions].applymap(lambda x: x.lower()))
survey_ids = np.array([' '.join(x) for x in survey_ids])
survey_ids = survey_ids.flatten()
survey_results = np.array(survey_results[numeric_questions])
# Remove duplicates by ids (might want to take last occurrence)
_, first_occurrences = np.unique(survey_ids, return_index=True)
mentee_survey_ids, mentee_survey_results = survey_ids[first_occurrences], survey_results[first_occurrences]

# Load mentor survey data
survey_results = pd.read_csv('./data/survey_results_mentors.csv')
questions = survey_results.iloc[0]
survey_results = survey_results.iloc[2:]
id_questions = ['email address']
# categorical_questions = ['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q31', 'Q32', 'Q33']
numeric_questions = ['Q3_1', 'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13', 'Q14', 'Q16_1', 'Q16_2', 'Q16_3', 'Q16_4', 'Q17_2', 'Q17_3', 'Q17_4', 'Q17_5', 'Q17_6', 'Q17_1', 'Q18_1', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6', 'Q19_1', 'Q19_2', 'Q19_3', 'Q19_4', 'Q20_1', 'Q20_2', 'Q20_3', 'Q20_4', 'Q20_5', 'Q21_1', 'Q21_2', 'Q22', 'Q23', 'Q24', 'Q25']
# Eliminate empty names
for col in id_questions:
    survey_results = survey_results[survey_results[col].notna()]
# Clean strings
def clean(x):
    if pd.isnull(x):
        return x
    else:
        return int(''.join(s for s in x if s.isdigit()))
survey_results[numeric_questions] = survey_results[numeric_questions].applymap(clean)
# Fill missing results
knn = KNNImputer(n_neighbors=5)
survey_results[numeric_questions] = knn.fit_transform(survey_results[numeric_questions])
survey_results[numeric_questions] = survey_results[numeric_questions].astype(int)
# Standardize
survey_results[numeric_questions] = preprocessing.scale(survey_results[numeric_questions], axis=0)
# Finalize
survey_ids = np.array(survey_results[id_questions].applymap(lambda x: x.lower()))
survey_ids = np.array([' '.join(x) for x in survey_ids])
survey_ids = survey_ids.flatten()
survey_results = np.array(survey_results[numeric_questions])
# Remove duplicates by ids (might want to take last occurrence)
_, first_occurrences = np.unique(survey_ids, return_index=True)
mentor_survey_ids, mentor_survey_results = survey_ids[first_occurrences], survey_results[first_occurrences]

# Load match data
match_results = pd.read_csv('./data/matches.csv')
mentor_cols = [' Mentor Email']
mentee_cols = ['Mentee First Name', 'Mentee Last Name']
match_results_mentor = np.array(match_results[mentor_cols].applymap(lambda x: x.lower()))
match_results_mentor = np.array([' '.join(x) for x in match_results_mentor])
match_results_mentee = np.array(match_results[mentee_cols].applymap(lambda x: x.lower()))
match_results_mentee = np.array([' '.join(x) for x in match_results_mentee])
match_results = np.stack([match_results_mentee, match_results_mentor], axis=1)

# Take and filter to intersection of mentees
mentee_intersection = np.intersect1d(match_results[:, 0], mentee_survey_ids)
survey_idx = [n in mentee_intersection for n in mentee_survey_ids]
mentee_survey_ids = mentee_survey_ids[survey_idx]
mentee_survey_results = mentee_survey_results[survey_idx]
match_idx = [n in mentee_intersection for n in match_results[:, 0]]
match_results = match_results[match_idx, :]

# Take and filter to intersection of mentors
mentor_intersection = np.intersect1d(match_results[:, 1], mentor_survey_ids)
survey_idx = [n in mentor_intersection for n in mentor_survey_ids]
mentor_survey_ids = mentor_survey_ids[survey_idx]
mentor_survey_results = mentor_survey_results[survey_idx]
match_idx = [n in mentor_intersection for n in match_results[:, 1]]
match_results = match_results[match_idx, :]

# Take and filter to intersection of mentees (again, since mentor filtering could exclude a few)
mentee_intersection = np.intersect1d(match_results[:, 0], mentee_survey_ids)
survey_idx = [n in mentee_intersection for n in mentee_survey_ids]
mentee_survey_ids = mentee_survey_ids[survey_idx]
mentee_survey_results = mentee_survey_results[survey_idx]

# Unique mentors
mentors, match_results[:, 1] = np.unique(match_results[:, 1], return_inverse=True)
mentor_survey_ids = np.array([np.argwhere(n == mentors)[0] for n in mentor_survey_ids]).flatten()
# Unique mentees
mentees, match_results[:, 0] = np.unique(match_results[:, 0], return_inverse=True)
mentee_survey_ids = np.array([np.argwhere(n == mentees)[0] for n in mentee_survey_ids]).flatten()

# Formulate vars
matches = match_results.astype(int)
matches_outcome = np.ones(len(matches)).astype(float)  # Presence data
mentee_features = mentee_survey_results.astype(float)
mentee_features_ids = mentee_survey_ids
mentor_features = mentor_survey_results.astype(float)
mentor_features_ids = mentor_survey_ids
# mentees
# mentors

In [5]:
# Train-Test
train_idx = range(int(.8 * len(mentees)))
test_idx = list(set(range(len(mentees))) - set(train_idx))
sort_idx = np.argsort(feature_ids)  # TODO, Further shuffle matches
mentee_features, mentee_features_test = mentee_features[sort_idx][train_idx], mentee_features[sort_idx][test_idx]
feature_ids, feature_ids_test = feature_ids[sort_idx][train_idx], feature_ids[sort_idx][test_idx]
idx = [(mentee in feature_ids) for mentee in matches[:, 0]]
tidx = [(mentee in feature_ids_test) for mentee in matches[:, 0]]
matches, matches_test = matches[idx], matches[tidx]
matches_outcome, matches_outcome_test = matches_outcome[idx], matches_outcome[tidx]
# Finalize dim variables
num_mentors = len(mentors)
num_mentees = len(train_idx)
num_matches = len(matches)
num_features = mentee_features.shape[1]
num_mentors_test = len(mentors)
num_mentees_test = len(test_idx)
num_matches_test = len(matches_test)
num_features_test = mentee_features.shape[1]

# Implement fake negatives (optional)
fake_percentage = .5
fake_num = int(fake_percentage * num_matches / (1 - fake_percentage))
fake_mentees = np.random.choice(range(num_mentees), fake_num, replace=True)
fake_mentors = np.random.choice(range(num_mentors), fake_num, replace=True)
fake_matches = np.stack((fake_mentees, fake_mentors), axis=1)
fake_outcome = np.zeros(fake_num)
# Append
matches = np.concatenate((matches, fake_matches), axis=0)
matches_outcome = np.concatenate((matches_outcome, fake_outcome), axis=0)
num_matches = len(matches)

# Cast to type
matches = torch.Tensor(matches.astype(float)).int()
matches_outcome = torch.Tensor(matches_outcome).float()
matches_test = torch.Tensor(matches_test.astype(float)).int()
matches_outcome_test = torch.Tensor(matches_outcome_test).float()
mentee_features = torch.Tensor(mentee_features).float()
mentee_features_test = torch.Tensor(mentee_features_test).float()

# Autoencoder KNN Model

In [6]:
# User Variables
dim_embedding = 10
epochs = 2001
batches = 20
lr = .001
epoch_pd = 200
batch_size = 64
batches = int(len(matches)/batch_size)

# Autoencoder
model = LatentModel(num_features, latent_dim=dim_embedding)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
criterion = nn.MSELoss()

In [7]:
model.train()
for epoch in range(epochs):
    for _ in range(batches):
        # Train
        optimizer.zero_grad()
        idx = np.random.choice(range(num_mentees), batch_size, replace=False)
        input_data = mentee_features[idx]
        _, reconst = model(input_data)
        loss = criterion(reconst, input_data)
        loss.backward()
        optimizer.step()
        
    if epoch % epoch_pd == 0:
        print(f'Epoch: {epoch}', end=' \t')
        print(f'Loss: {float(loss)}')
model.eval();

Epoch: 0 	Loss: 1.6940064430236816
Epoch: 200 	Loss: 0.033134836703538895
Epoch: 400 	Loss: 0.020022958517074585
Epoch: 600 	Loss: 0.018427524715662003
Epoch: 800 	Loss: 0.01828780397772789
Epoch: 1000 	Loss: 0.016249192878603935
Epoch: 1200 	Loss: 0.02792241796851158
Epoch: 1400 	Loss: 0.01245100423693657
Epoch: 1600 	Loss: 0.014294893480837345
Epoch: 1800 	Loss: 0.018847832456231117
Epoch: 2000 	Loss: 0.029806623235344887


## Evaluation

In [37]:
# Make KNN
KNN = KNeighborsClassifier(n_neighbors=10)
feat = model(mentee_features)[0].detach().cpu().numpy()
labels = matches[:, 1].numpy()
KNN.fit(feat[matches[:, 0]], labels)

# Predict
predicted = KNN.kneighbors(feat[matches[:, 0]])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)[:, :5]
actual = matches[:, 1]
print(sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual))

feat_test = model(mentee_features_test)[0].detach().cpu().numpy()
predicted = KNN.kneighbors(feat_test[matches_test[:, 0] - len(train_idx)])[1]
predicted = np.vectorize(lambda x: labels[int(x)])(predicted)[:, :5]
actual = matches[:, 1]
print(sum([int(ex) in pr for ex, pr in zip(actual, predicted)]) / len(actual))

0.9940476190476191
0.005952380952380952
