In [1]:
# Import necessary dependencies
import os
import sys
import pathlib
import json
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

import warnings
warnings.filterwarnings('ignore')

from cos_sim import *

# Locate the 'userInfo' folder, a sibling of the current working directory
current_dir = os.getcwd()
project_dir = os.path.dirname(current_dir)

USERDATA = pathlib.Path(project_dir) / 'userInfo'

In [2]:
# Read in the example user ('rocco.json') and the full user list ('input.json').
# The encoding is stated explicitly so the files are decoded as UTF-8 on every
# platform instead of with the locale's default codec.
with open(USERDATA / 'rocco.json', 'r', encoding='utf-8') as file:
    info = json.load(file)

with open(USERDATA / 'input.json', 'r', encoding='utf-8') as inFile:
    allUsers = json.load(inFile)

In [3]:
# Primary roles that each get their own table
ROLE_NAMES = ('data science', 'back-end', 'front-end', 'business')

# Gather the raw user records per role first, then build each DataFrame once.
# This avoids growing a DataFrame row by row with pd.concat (quadratic, and
# concatenating onto an empty frame is deprecated behaviour in pandas).
records_by_role = {role: [] for role in ROLE_NAMES}

for user in allUsers:
    role_records = records_by_role.get(user['role1'])
    # Users whose primary role is not one of ROLE_NAMES are left out
    if role_records is not None:
        role_records.append(user)

# One DataFrame per role; a role without users yields an empty DataFrame
data_science = pd.DataFrame(records_by_role['data science'])
backend = pd.DataFrame(records_by_role['back-end'])
frontend = pd.DataFrame(records_by_role['front-end'])
business = pd.DataFrame(records_by_role['business'])

# List of role tables
role_tables = [data_science, backend, frontend, business]

# Build the tables used for vectorization: each role table minus the columns
# that are not needed there (columns missing from a table are ignored)
vec_tables = [
    role_table.drop(columns=['school', 'note', 'discordLink'], errors='ignore')
    for role_table in role_tables
]

# Individual names for the four tables, in the same order as role_tables
vec_ds, vec_be, vec_fe, vec_bs = vec_tables

In [4]:
# Vectorize all users in all tables, then pass the results through align_columns.
# The inputs are listed explicitly instead of being read back from `vec_tables`,
# so re-running this cell never vectorizes already-vectorized tables.
vec_tables = align_columns(
    [vectorize(table) for table in (vec_ds, vec_be, vec_fe, vec_bs)]
)

# Dictionary mapping each role to its vectorized DataFrame (same order as above)
vec_dict = {
    "data science": vec_tables[0],
    "back-end": vec_tables[1],
    "front-end": vec_tables[2],
    "business": vec_tables[3]
}

In [5]:
# Preview the vectorized data-science table; .head() keeps the display bounded
vec_tables[0].head()

Unnamed: 0,experienceLevel_beginner,experienceLevel_expert,experienceLevel_intermediate,goal_gain experience,goal_have fun,goal_networking,goal_win hackathon,name,primary_c#,primary_c++,...,secondary_python,secondary_r,secondary_react,secondary_sql,secondary_tableau,trait_analytical,trait_collaborative,trait_efficient,trait_flexible,userId
0,False,0,True,True,0,0,0,thanh,0,0,...,0,0,0,0,0,True,False,0,0,56781234
1,True,0,False,True,0,0,0,hudson,0,0,...,0,0,0,0,0,False,True,0,0,67812345


In [6]:
# Convert the loaded dictionary to a one-row DataFrame.
# Guarded so that re-running this cell does not wrap the DataFrame again.
if not isinstance(info, pd.DataFrame):
    info = pd.DataFrame([info])

# Get the vector for the user.
# The extra columns are dropped BEFORE vectorizing, exactly as is done for the
# role tables, so the user goes through the same pipeline as the reference data.
userVector = align_single_user(
    user_vector=vectorize(
        info=info.drop(
            columns=['school', 'note', 'discordLink'],
            errors='ignore'
        )
    ),
    reference_columns=vec_tables[0].columns
)

In [7]:
# Score the user's vector against every role table in vec_dict
# (cosine similarity, computed by compare_cos_sim)
sorted_similarity_tables = compare_cos_sim(user_vector=userVector, vec_tables=vec_dict)

In [8]:
# Fields copied from the original (non-vectorized) user record into the output
OUTPUT_FIELDS = [
    "userId",
    "name",
    "experienceLevel",
    "role1",
    "role2",
    "primaryLanguages",
    "secondaryLanguages",
    "school",
    "goal",
    "note",
    "trait",
    "discordLink",
]

# Original role table to look each recommended user up in
role_sources = {
    'data science': data_science,
    'back-end': backend,
    'front-end': frontend,
    'business': business,
}

# Create output: one list of recommendation dicts per role, in the iteration
# order of sorted_similarity_tables
output = []

for role, table in sorted_similarity_tables.items():
    # An unexpected role raises KeyError here instead of silently reusing the
    # rows matched for a previous role
    source_table = role_sources[role]
    recommendations = []

    for _, row in table.iterrows():
        user_id = row['userId']  # not named `id`, which would shadow the builtin
        matching_row = source_table[source_table['userId'] == user_id]

        # Take each field from the first matching row of the original table
        recommendations.append(
            {field: matching_row[field].iloc[0] for field in OUTPUT_FIELDS}
        )

    # Append the recommendations for each role to the output
    output.append(recommendations)

In [9]:
# Show the recommendations built for the first role yielded by sorted_similarity_tables
output[0]

[{'userId': 56781234,
  'name': 'thanh',
  'experienceLevel': 'intermediate',
  'role1': 'data science',
  'role2': 'back-end',
  'primaryLanguages': ['python', 'r', 'sql'],
  'secondaryLanguages': ['c++', 'c', 'java'],
  'school': 'ua',
  'goal': 'gain experience',
  'note': 'optional here',
  'trait': 'analytical',
  'discordLink': 'discord link here'},
 {'userId': 67812345,
  'name': 'hudson',
  'experienceLevel': 'beginner',
  'role1': 'data science',
  'role2': 'business',
  'primaryLanguages': ['python', 'sql', 'julia'],
  'secondaryLanguages': ['powerpoint', 'c#', 'excel'],
  'school': 'ua',
  'goal': 'gain experience',
  'note': 'optional here',
  'trait': 'collaborative',
  'discordLink': 'discord link here'}]