## minimum viable product:
* extract tags per survey (dataframe cleaned up per survey response, this is done!)
* create a score matrix, n by m (n mentors by m mentees) (e.g. can be accessed as a dataframe)
* create a function that compares every mentor with every mentee by tags and writes each pair's score into the n by m matrix
* extract top 3-5 highest scoring matches per mentee
* apply the stable marriage algorithm to the scored matches
* output matches

In [ ]:
# importing libraries
!pip install matching
import pandas as pd
import numpy as np
from matching import Player
from matching.games import StableMarriage
from matching.games import HospitalResident

You should consider upgrading via the 'pip install --upgrade pip' command.


In [ ]:
# Load the survey responses, drop the 'Mentor: Profession.1' column and
# replace the remaining headers with short names.
short_column_names = [
    'Timestamp', 'mOs', 'Mindustry', 'Bregion', 'mWhy', 'Mwhy',
    'gender', 'yearsExp', 'CompSize', 'Profession', 'track',
    'ethnicity', 'Growth', 'Groups',
]
df = pd.read_csv('responses.csv').drop(columns=['Mentor: Profession.1'])
# Headers are assigned by position, so the list above must follow the
# column order left after the drop.
df.columns = short_column_names

In [ ]:
# MAPPING OVERLAPPING COLUMNS: WHY AND PROFESSION
# Mentors and students answered separate "why" and profession/track
# questions; this cell folds each pair into one column that both roles share.

# Explicit copy so the column assignments below modify `var`, not a view of df.
var = df[['mOs', 'mWhy', 'Mwhy']].copy()
is_mentor = var['mOs'] == 'Mentor'

# Mentors take their answer from 'Mwhy', everyone else from 'mWhy'.
new_col = var['Mwhy'].where(is_mentor, var['mWhy'])
# Assign the filled result back: calling fillna(inplace=True) on a selected
# column is chained assignment and does not update `var` under copy-on-write.
var['combined_why'] = new_col.fillna("Enhance CV")  # replacing nulls

# adding profession and track columns
var = var.join(df[['Profession', 'track']])

# mapping each mentor profession to the track vocabulary used by students
mappedprofs = {
    'Data and Analytics': 'IT',
    'Help Desk': 'IT',
    'Cyber Security': 'IT',
    'Quality Assurance': 'IT',
    'Operations': 'Financial Operations',
    'Finance': 'Financial Operations',
    'Anti-Money Laundering': 'Financial Operations',
    'Community Management': 'Sales and Customer Support',
    'Marketing': 'Sales and Customer Support',
    'Customer Support': 'Sales and Customer Support',
    'Educator': 'Sales and Customer Support',
    'Project Management': 'Business Operations',
    'Content': 'Business Operations',
    'Media Production': 'Business Operations',
    'Business Development': 'Business Operations',
    'Sales': 'Sales and Customer Support',
    'Human Resources': 'Business Operations',
    'Executive or Founder': 'Business Operations',
    'Engineering': 'Software Development',
    'Visual Design': 'Software Development',
    'UX/UI Design': 'Software Development',
    'Product Management': 'Software Development',
}

# Series.map yields NaN for a missing or unlisted profession instead of
# raising KeyError, so such mentors reach the "IT" fallback like everyone else.
maptracks = var['Profession'].map(mappedprofs).where(is_mentor, var['track'])
var['combined_track'] = maptracks.fillna("IT")  # replacing nulls


In [ ]:
# Assemble the frame used for matching: the survey columns taken directly
# from df plus the two combined columns computed in var.
survey_columns = ['mOs', 'Bregion', 'gender', 'ethnicity', 'Growth', 'Groups']
combined_columns = ['combined_why', 'combined_track']

match_df1 = df.loc[:, survey_columns]
match_df2 = var.loc[:, combined_columns]
# Both pieces share df's row index, so the join lines them up row for row.
match_df = match_df1.join(match_df2)
match_df.head()

Unnamed: 0,mOs,Bregion,gender,ethnicity,Growth,Groups,combined_why,combined_track
0,Mentor,Marin,she/her,Hispanic or Latinx,Networking,LGBTQ+,Develop Leadership Qualities,Business Operations
1,Student,Alameda,he/him,Native American or American Indian,Communication,Female-Identifying,Develop Leadership Qualities,Financial Operations
2,Mentor,Santa Clara,she/her,Asian,Professional Development,"Person of Color, Female-Identifying",Reinforce Industry Knowledge,Software Development
3,Student,Contra Costa,she/her,Caucasion,Technical Skills,"Female-Identifying, Person living with a Disab...",Reinforce Industry Knowledge,Financial Operations
4,Mentor,San Mateo,she/her,Asian,Networking,Immigrant,Improving Communication Skills,Business Operations


In [ ]:
# Give every respondent a random ID: 10 lowercase letters (a-z).
# The IDs are used later as DataFrame labels and dictionary keys, so they
# must be distinct; duplicates are rejected and redrawn.
import random
import string

ID_LENGTH = 10
letters = string.ascii_lowercase

unique_id = []
seen_ids = set()
while len(unique_id) < match_df.shape[0]:
    candidate = ''.join(random.choices(letters, k=ID_LENGTH))
    if candidate in seen_ids:
        # Collision (very unlikely at this length): draw again.
        continue
    seen_ids.add(candidate)
    unique_id.append(candidate)

# creating column of unique ids
match_df["ID"] = unique_id

In [ ]:
# Create match_sc, the score matrix for every mentor/student comparison,
# initialised to 0.0.
student_subset_df = match_df[match_df['mOs'] == 'Student']
mentor_subset_df = match_df[match_df['mOs'] == 'Mentor']

# One ROW per mentor and one COLUMN per student. The columns are labelled
# with student IDs here and the rows receive mentor IDs in the next cell, so
# the shape must be (mentors, students); the reverse order only works when
# both groups happen to be the same size.
match_sc = pd.DataFrame(
    np.zeros(shape=(mentor_subset_df.shape[0], student_subset_df.shape[0]))
)
match_sc.columns = student_subset_df['ID']

In [ ]:
# Label the rows of match_sc with mentor IDs.
# The mentor subset is renumbered from 0 so its 'ID' column aligns with the
# default integer row index of match_sc when the column is attached.
mentor_subset_df = mentor_subset_df.reset_index(drop=True)
match_sc = match_sc.assign(mentor_id=mentor_subset_df['ID']).set_index('mentor_id')

In [ ]:
match_sc.head()

Unnamed: 0_level_0,dmmkxquseb,vugkiivvum,mdwcmsntin,gamomtqrjg,jdqsckwlrh,rosesaqxbg,iczikaojqi,msioztynxy,tspzrlnsix,cofiksrchi,...,hcoxiwzrco,egntaqdwns,trsmjpbhtf,coizwrrwnt,yqjyezsrue,fzxbkenyvu,fsxidpmuam,nmcuxzlnqm,jyniohlvjh,xmrdiiqdmc
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
iwhezibrwj,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
omxsdvyuch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
gqkecncnch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sdwgmfnofm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
yjhpusifiy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [ ]:
match_df.head()

Unnamed: 0,mOs,Bregion,gender,ethnicity,Growth,Groups,combined_why,combined_track,ID
0,Mentor,Marin,she/her,Hispanic or Latinx,Networking,LGBTQ+,Develop Leadership Qualities,Business Operations,iwhezibrwj
1,Student,Alameda,he/him,Native American or American Indian,Communication,Female-Identifying,Develop Leadership Qualities,Financial Operations,dmmkxquseb
2,Mentor,Santa Clara,she/her,Asian,Professional Development,"Person of Color, Female-Identifying",Reinforce Industry Knowledge,Software Development,omxsdvyuch
3,Student,Contra Costa,she/her,Caucasion,Technical Skills,"Female-Identifying, Person living with a Disab...",Reinforce Industry Knowledge,Financial Operations,vugkiivvum
4,Mentor,San Mateo,she/her,Asian,Networking,Immigrant,Improving Communication Skills,Business Operations,gqkecncnch


In [ ]:
def group_equal(student, mentor):
    """Count the equal (student item, mentor item) pairs, ignoring position.

    Every item of ``student`` is compared with every item of ``mentor``; each
    pair that compares equal adds one to the total, so repeated items are
    counted once per matching pair.
    """
    return sum(
        1
        for student_item in student
        for mentor_item in mentor
        if student_item == mentor_item
    )

In [ ]:
def final_score(region, no_group, with_group, require_same_region=False):
    """Combine the partial scores of one student/mentor pair.

    Parameters
    ----------
    region : int
        Region comparison score; 0 means the two people are in different
        regions.
    no_group : int
        Number of matching values outside the 'Groups' column.
    with_group : int
        Number of matching 'Groups' tags.
    require_same_region : bool, optional
        When True, a pair from different regions scores 0, which rules the
        match out. Defaults to False because it is still an open question
        whether region is a hard requirement, so by default the region score
        does not affect the result.

    Returns
    -------
    int
        ``no_group + with_group``, or 0 when the region constraint is
        enabled and not met.
    """
    if require_same_region and region == 0:
        return 0
    return no_group + with_group

In [ ]:
# Compute the score of every student/mentor pair and store it in match_sc.
# The scores are later used to rank students for each mentor and mentors for
# each student.

# Look every person up by ID once, instead of filtering match_df and dropping
# columns again for each pair inside the double loop.
people = match_df.set_index('ID', drop=False)
non_group_frame = people.drop(columns=['Groups'])
# One array of non-group values per person, in the same column order for all.
non_group_values = dict(zip(non_group_frame.index, non_group_frame.to_numpy()))
regions = people['Bregion'].to_dict()
# 'Groups' holds comma-separated tags; a missing value becomes an empty list
# rather than failing on .split().
group_tags = {
    person_id: groups.split(', ') if isinstance(groups, str) else []
    for person_id, groups in people['Groups'].items()
}

for student_id in match_sc.columns:
    student_values = non_group_values[student_id]
    student_tags = group_tags[student_id]
    for mentor_id in match_sc.index:
        # matching values between student and mentor outside the group column
        # (this comparison includes the 'Bregion' column)
        score_wo_group = np.count_nonzero(student_values == non_group_values[mentor_id])
        # region comparison score: 1 when both regions are equal, otherwise 0
        score_region_comparison = int(regions[student_id] == regions[mentor_id])
        # number of group tags the two people have in common
        score_group_comparison = group_equal(student_tags, group_tags[mentor_id])
        # .at writes straight into match_sc; match_sc[col][row] = value is
        # chained assignment and may write to a temporary copy instead.
        match_sc.at[mentor_id, student_id] = final_score(
            score_region_comparison, score_wo_group, score_group_comparison
        )

KeyboardInterrupt: 

In [0]:
match_sc

In [0]:
# Rank students for every mentor (one match_sc row per mentor), best score
# first. Each value is an array of student IDs.
# Top five only: Series.nlargest() keeps five entries by default.
mentor_pref_dict_5 = {
    mentor_id: np.array(student_scores.nlargest().index.values)
    for mentor_id, student_scores in match_sc.iterrows()
}
# Complete ranking: ask nlargest for as many entries as there are students.
mentor_pref_dict = {
    mentor_id: np.array(student_scores.nlargest(len(student_scores)).index.values)
    for mentor_id, student_scores in match_sc.iterrows()
}

In [0]:
# Rank mentors for every student (one match_sc column per student), best
# score first. Each value is an array of mentor IDs.
# Top five only: Series.nlargest() keeps five entries by default.
student_pref_dict_5 = {
    student_id: np.array(match_sc[student_id].nlargest().index.values)
    for student_id in match_sc
}
# Complete ranking: ask nlargest for as many entries as there are mentors.
student_pref_dict = {
    student_id: np.array(
        match_sc[student_id].nlargest(len(match_sc[student_id])).index.values
    )
    for student_id in match_sc
}

In [0]:
### STABLE MARRIAGE PLAYER SETUP WITH PREFERENCES SET

# One Player per mentor and one per student, named by ID.
mentors_ids = list(mentor_pref_dict.keys())  # list of ids for mentors
students_ids = list(student_pref_dict.keys())  # list of ids for students

mentor_list_player = [Player(name=i) for i in mentors_ids]
student_list_player = [Player(name=i) for i in students_ids]

# ID -> Player lookups, so that preferences can point at the very Player
# objects held in the lists above. Creating fresh Player(name=...) objects for
# the preference lists produces look-alikes that are different instances, so
# the two sides would never reference each other.
# NOTE(review): the matching library's examples pass the opposite side's own
# Player objects to set_prefs - confirm against the installed version.
mentor_player_by_id = {player.name: player for player in mentor_list_player}
student_player_by_id = {player.name: player for player in student_list_player}

# each mentor ranks the student players, best score first
for mentor_player in mentor_list_player:
    ranked_student_ids = mentor_pref_dict[mentor_player.name]
    mentor_player.set_prefs([student_player_by_id[i] for i in ranked_student_ids])

# each student ranks the mentor players, best score first
for student_player in student_list_player:
    ranked_mentor_ids = student_pref_dict[student_player.name]
    student_player.set_prefs([mentor_player_by_id[i] for i in ranked_mentor_ids])

#final match between mentors and mentee
#from matching.games import StableMarriage

#match_result = StableMarriage(mentor_list_player, student_list_player)
#match_result = StableMarriage(student_list_player,mentor_list_player)
#match_result.solve()

In [0]:
# HOSPITAL RESIDENT FORMULATION
# Per the original note, this worked when the entire set of rankings for
# students and mentors was passed in.

# Every mentor ID gets capacity 1.
# NOTE(review): presumably this limits each mentor to a single student -
# confirm against the matching documentation.
capacities = dict.fromkeys(mentor_pref_dict, 1)

# documentation: https://github.com/daffidwilde/matching
# Argument order: student preferences, mentor preferences, capacities.
game = HospitalResident.create_from_dictionaries(
    student_pref_dict,
    mentor_pref_dict,
    capacities,
)

In [0]:
# Solve the game created above from the student/mentor preference
# dictionaries and the capacities; the result is bound to `matches`.
# The original notebook run reported 26 matches for its input data.
matches = game.solve()

In [0]:
# printing out matches, {key= mentor, value = [student]}
matches

In [0]:
len(matches)

In [0]:
# score matrix of all matches
match_sc
