In [None]:
import pandas as pd 
import numpy as np 
import random 
import string 
from ast import literal_eval
from matching import Player  
from matching.games import HospitalResident
import joblib # for saving algorithm and preprocessing objects

In [None]:
# Swap every semicolon for a comma (the original note says semicolons are the
# django survey library's default separator).
def fix_df_arrays(df):
    """Return a new frame in which each ';' inside the cells of ``df`` is ','.

    The substitution is regex-based, so it also applies to semicolons that
    appear in the middle of a string cell. ``df`` itself is not modified.
    """
    separator_pattern = ';'
    return df.replace(to_replace=separator_pattern, value=',', regex=True)

In [None]:
# Builds one random identifier per survey row (mentors and students alike)
# and stores them in an "ID" column.
# @survey_df: the dataframe whose rows receive an identifier
def create_user_id(survey_df):
    """Attach a random, collision-free 10-letter ID to every row of ``survey_df``.

    The frame is modified in place (an "ID" column is added or overwritten)
    and the same object is returned.

    Each ID is drawn from lowercase ASCII letters. A candidate that repeats
    an ID already handed out in this call is redrawn, because the IDs are
    used later as lookup keys and must identify exactly one row.
    """
    letters = string.ascii_lowercase
    id_length = 10

    def draw_candidate():
        return ''.join(random.choice(letters) for _ in range(id_length))

    issued = set()
    unique_id = []
    for _ in range(survey_df.shape[0]):
        candidate = draw_candidate()
        # Collisions are extremely unlikely but would break the later
        # one-row-per-ID lookups, so keep drawing until the ID is fresh.
        while candidate in issued:
            candidate = draw_candidate()
        issued.add(candidate)
        unique_id.append(candidate)

    # creating column of unique ids
    survey_df["ID"] = unique_id
    return survey_df

In [None]:
# Creates the score matrix, filled with 0's.
# @survey_df: the survey dataframe (needs the role column and the "ID" column)
def create_score_matrix(survey_df):
    """Return a zero-filled score matrix with one row per mentor and one column per student.

    Columns are labelled with the students' "ID" values; rows keep a default
    integer index (``assign_id`` replaces it with mentor IDs afterwards).

    The matrix is shaped (mentors, students). Building it the other way
    round only works when both groups happen to be the same size, because
    the column labels are the student IDs.
    """
    role = survey_df['Are you a Mentor or Student?']
    student_ids = survey_df.loc[role == 'student', 'ID']
    mentor_count = int((role == 'mentor').sum())

    score_matrix = pd.DataFrame(np.zeros(shape=(mentor_count, len(student_ids))))
    score_matrix.columns = student_ids
    return score_matrix

In [None]:
# Labels the rows of the score matrix with the mentors' IDs.
def assign_id(survey_df, score_matrix):
    """Index ``score_matrix`` by mentor ID and return it.

    The mentors' "ID" values are taken from ``survey_df`` in row order,
    renumbered from 0 so they line up with the matrix's positional index,
    written into a temporary 'mentor_id' column and then promoted to the
    index. ``score_matrix`` is modified in place.
    """
    is_mentor = survey_df['Are you a Mentor or Student?'] == 'mentor'
    mentor_ids = survey_df.loc[is_mentor, 'ID'].reset_index(drop=True)

    score_matrix['mentor_id'] = mentor_ids
    score_matrix.set_index('mentor_id', inplace=True)
    return score_matrix

In [None]:
# Combines the two partial scores of a mentor/student pair into one total.
def final_score(region, no_group, with_group):
    """Return ``no_group + with_group``.

    ``region`` is accepted but currently ignored: it does not influence the
    result.
    """
    total = no_group + with_group
    return total

In [None]:
# Returns the number of matching answers between two survey responses,
# regardless of the position of the items inside a multi-choice answer.
# @student: the student's response to one question
# @mentor: the mentor's response to the same question

def count_equal_responses(student, mentor):
    """Count how many answers ``student`` and ``mentor`` have in common.

    * A response that is not a string (integers, floats, NaN for a skipped
      question, booleans, ...) never contributes: the result is 0.
    * A string containing '[' is treated as a list literal. Both responses
      are parsed with ``literal_eval`` and every (student item, mentor item)
      pair that compares equal adds 1. If either side cannot be parsed into
      something iterable, the question contributes 0 instead of raising.
    * Any other string contributes 1 when it equals the mentor's response,
      else 0.
    """
    # Only text answers are comparable. This also keeps missing values (NaN)
    # from blowing up the substring test below.
    if not isinstance(student, str):
        return 0

    if '[' in student:
        if not isinstance(mentor, str):
            return 0
        try:
            student_choices = list(literal_eval(student))
            mentor_choices = list(literal_eval(mentor))
        except (ValueError, SyntaxError, TypeError):
            # Not a well-formed literal, or a literal that is not iterable.
            return 0
        return sum(
            1
            for student_item in student_choices
            for mentor_item in mentor_choices
            if student_item == mentor_item
        )

    return 1 if student == mentor else 0

In [None]:
def calculate_match_scores(score_matrix, survey_df):
    """Fill ``score_matrix`` with a similarity score for every mentor/student pair.

    ``score_matrix`` is expected to have student IDs as column labels and
    mentor IDs as row labels. For each pair, the survey rows whose "ID"
    equals those labels are looked up in ``survey_df`` and
    ``count_equal_responses`` is summed over every column of the survey.

    The matrix is updated in place and also returned.
    """
    survey_columns = list(survey_df.columns)

    # The mentor rows do not depend on the student, so fetch each one once.
    mentor_rows = {
        mentor_id: survey_df[survey_df['ID'] == mentor_id]
        for mentor_id in score_matrix.index
    }

    # iterating over students in score matrix
    for student_id in score_matrix.columns:
        # getting this student's survey responses from the survey dataframe
        student = survey_df[survey_df['ID'] == student_id]
        for mentor_id, mentor in mentor_rows.items():
            # compare the student to this mentor, question by question
            score = 0
            for col in survey_columns:
                score += count_equal_responses(student[col].squeeze(), mentor[col].squeeze())
            # Write with a single .loc call: chained indexing
            # (matrix[col][row] = ...) may assign to a temporary and is a
            # no-op when pandas Copy-on-Write is enabled.
            score_matrix.loc[mentor_id, student_id] = score
    return score_matrix

In [None]:
# Ranks the students for every mentor, best score first.
# @score_matrix: the dataframe of scores (one row per mentor, one column per student)
def top_matches_mentor(score_matrix):
    """Return ``(full_ranking, top_five)`` dictionaries keyed by row label.

    For each row of ``score_matrix`` the column labels are ordered by
    descending score. ``full_ranking`` holds the complete ordering and
    ``top_five`` holds at most the five best labels; both store numpy arrays.
    """
    def ranked_labels(scores, how_many):
        return np.array(scores.nlargest(how_many).index.values)

    full_ranking = {}
    top_five = {}
    for mentor_id, scores in score_matrix.iterrows():
        top_five[mentor_id] = ranked_labels(scores, 5)
        full_ranking[mentor_id] = ranked_labels(scores, len(scores))
    return full_ranking, top_five

In [None]:
# Ranks the mentors for every student, best score first.
# @score_matrix: the dataframe of scores (one row per mentor, one column per student)
def top_matches_student(score_matrix):
    """Return ``(full_ranking, top_five)`` dictionaries keyed by column label.

    For each column of ``score_matrix`` the row labels are ordered by
    descending score. ``full_ranking`` holds the complete ordering and
    ``top_five`` holds at most the five best labels; both store numpy arrays.
    """
    def ranked_labels(scores, how_many):
        return np.array(scores.nlargest(how_many).index.values)

    full_ranking = {}  # every mentor, ordered by score
    top_five = {}      # only the five best mentors
    for student_id in score_matrix.columns:
        scores = score_matrix[student_id]
        top_five[student_id] = ranked_labels(scores, 5)
        full_ranking[student_id] = ranked_labels(scores, len(scores))
    return full_ranking, top_five

In [None]:
# @mentor_pref_dict: each mentor's ranking of the students
# @student_pref_dict: each student's ranking of the mentors
# returns whatever the solved HospitalResident game returns
def get_mentor_match(mentor_pref_dict, student_pref_dict):
    """Solve the mentor/student assignment as a hospital-resident game.

    Students are passed as the residents and mentors as the hospitals, and
    every mentor is given a capacity of 1.
    Library documentation: https://github.com/daffidwilde/matching
    """
    one_student_each = dict.fromkeys(mentor_pref_dict, 1)

    game = HospitalResident.create_from_dictionaries(
        student_pref_dict, mentor_pref_dict, one_student_each
    )
    return game.solve()

In [None]:
def matches_to_json(matches, survey_df):
    """Serialise the solved matches as JSON keyed by database user ids.

    ``matches`` is read through ``keys()`` / ``values()``: each key
    stringifies to a mentor's generated ID and each value is the sequence of
    students matched to that mentor (only the first one is used). Mentors
    without a matched student are skipped.

    The result is ``DataFrame.to_json()`` of a two-column frame:
    'User IDs' lists every matched mentor followed by every matched student,
    and 'Match IDs' lists the respective partner on the same row. Generated
    IDs are translated to the 'user_id' values found in ``survey_df``.

    Raises:
        ValueError: if a generated ID does not identify exactly one row of
            ``survey_df``.
    """
    def database_id(generated_id):
        # replacing a randomly generated ID with the database user id
        rows = survey_df.loc[survey_df['ID'] == generated_id, 'user_id']
        if len(rows) != 1:
            raise ValueError(
                f"expected exactly one survey row for ID {generated_id!r}, found {len(rows)}"
            )
        return int(rows.iloc[0])

    mentor_ids = []
    student_ids = []
    for mentor, students in zip(matches.keys(), matches.values()):
        # A mentor can end up with no student (e.g. more mentors than
        # students); there is no pair to report for them.
        if len(students) == 0:
            continue
        mentor_ids.append(str(mentor))
        student_ids.append(str(students[0]))

    # Each pair is listed twice: once from the mentor's point of view and
    # once from the student's.
    user_ids = mentor_ids + student_ids
    partner_ids = student_ids + mentor_ids

    matches_df = pd.DataFrame({
        'User IDs': [database_id(generated_id) for generated_id in user_ids],
        'Match IDs': [database_id(generated_id) for generated_id in partner_ids],
    })
    return matches_df.to_json()

In [None]:
def test():
    """Run the whole matching pipeline on 'survey.csv' and print the JSON result."""
    survey = create_user_id(fix_df_arrays(pd.read_csv('survey.csv')))

    scores = assign_id(survey, create_score_matrix(survey))
    scores = calculate_match_scores(scores, survey)

    # Only the complete rankings feed the matching game; the top-5 lists
    # are not needed here.
    mentor_ranking, _mentor_top_five = top_matches_mentor(scores)
    student_ranking, _student_top_five = top_matches_student(scores)

    pairing = get_mentor_match(mentor_ranking, student_ranking)
    print(matches_to_json(pairing, survey))

In [None]:
# Persist every pipeline function to its own compressed joblib file, written
# relative to the current working directory.
# NOTE(review): pickle-based serialisation usually stores plain functions by
# reference rather than by value -- confirm that whatever loads these files
# can resolve the same definitions.
joblib.dump(value=fix_df_arrays, filename="./fix_df_arrays.joblib", compress=True)
joblib.dump(value=create_user_id, filename="./create_user_id.joblib", compress=True)
joblib.dump(value=create_score_matrix, filename="./create_score_matrix.joblib", compress=True)
joblib.dump(value=assign_id, filename="./assign_id.joblib", compress=True)
joblib.dump(value=calculate_match_scores, filename="./calculate_match_scores.joblib", compress=True)
joblib.dump(value=top_matches_mentor, filename="./top_matches_mentor.joblib", compress=True)
joblib.dump(value=top_matches_student, filename="./top_matches_student.joblib", compress=True)
joblib.dump(value=get_mentor_match, filename="./get_mentor_match.joblib", compress=True)
joblib.dump(value=matches_to_json, filename="./matches_to_json.joblib", compress=True)