In [1]:
import ast

import numpy as np
import pandas as pd
from scipy.special import softmax

# Survey column -> feature name used in the rest of the notebook.
# The key order is also the order in which the columns are requested from the CSV.
column_renames = {
    'Respondent': 'id',
    'FormalEducation': 'degree',
    'UndergradMajor': 'major',
    # the five skill-related answers are merged into one 'skills' list later on
    'LanguageWorkedWith': 'skills_1',
    'DatabaseWorkedWith': 'skills_2',
    'PlatformWorkedWith': 'skills_3',
    'IDE': 'skills_4',
    'OperatingSystem': 'skills_5',
    'YearsCoding': 'projectExperience',
    'Hobby': 'hobby',
    'DevType': 'desiredJob',
}
usecols = list(column_renames)

user_path = r'E:\OneDrive - National University of Singapore\code\NUS-ISS\Project1\IRS-PM-2024-09-10-GRP8-JobRecommendationSystem\Recommendation_System\New_User\user_clean.csv'

# Read only the columns listed above and give them their feature names.
df = pd.read_csv(user_path, usecols=usecols).rename(columns=column_renames)

df.head()

Unnamed: 0,id,hobby,degree,major,desiredJob,projectExperience,skills_1,skills_2,skills_3,skills_4,skills_5
0,569,Yes,"Other doctoral degree (Ph.D, Ed.D., etc.)","Another engineering discipline (ex. civil, ele...",Data scientist or machine learning specialist;...,12-14 years,C++;Julia;Python,,Linux,Atom;IntelliJ;Notepad++;PyCharm;Sublime Text,Linux-based
1,1142,Yes,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Back-end developer;Front-end developer;Full-st...,6-8 years,C++;Java;JavaScript;SQL;HTML;CSS;Bash/Shell,MongoDB;Oracle;Google Cloud Storage;Elasticsearch,,Eclipse;IntelliJ;Notepad++;Vim,Linux-based
2,1886,Yes,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Back-end developer;Desktop or enterprise appli...,3-5 years,C#;JavaScript;HTML;CSS,SQL Server;MySQL;SQLite,Windows Desktop or Server,Visual Studio,Windows
3,2208,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Information systems, information technology, o...",Back-end developer;Front-end developer;Full-st...,9-11 years,,,,,
4,3285,Yes,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",Back-end developer;Front-end developer;Full-st...,9-11 years,C#;JavaScript;Python;SQL;TypeScript;HTML;CSS,SQL Server,Android;AWS;Google Cloud Platform/App Engine;W...,Android Studio;Notepad++;Visual Studio;Visual ...,Windows


In [2]:
# Each skill-related answer is a ';'-separated string; turn every one into a list.
skill_columns = ['skills_1', 'skills_2', 'skills_3', 'skills_4', 'skills_5']
for skill_column in skill_columns:
    df[skill_column] = df[skill_column].str.split(';')

# Concatenate the per-category lists of each row into one flat 'skills' list.
# Cells that are not lists (e.g. missing answers) are skipped, so a row without
# any answer ends up with an empty list.
df['skills'] = df[skill_columns].apply(
    lambda row: [skill for part in row if isinstance(part, list) for skill in part],
    axis=1,
)
df.drop(columns=skill_columns, inplace=True)

In [3]:
# Encode the hobby answer as the strings 'True' / 'False'
# (every value other than 'Yes' becomes 'False').
df['hobby'] = ['True' if answer == 'Yes' else 'False' for answer in df['hobby']]

# Split the ';'-separated job types into a list; cells that are not lists after
# the split (e.g. missing answers) become an empty list.
split_jobs = df['desiredJob'].str.split(';')
df['desiredJob'] = split_jobs.apply(lambda jobs: list(jobs) if isinstance(jobs, list) else [])

In [4]:
# Profile of the new user, expressed with the same columns as df.
# NOTE(review): 'desiredJob' values such as 'Data Scientist' and skill spellings
# such as 'Pycharm' differ from the values visible in the df.head() preview
# ('Data scientist or machine learning specialist', 'PyCharm'). The similarity
# cell compares strings exactly, so these entries may never match - confirm
# against the survey vocabulary.
new_data = {
    'id': 0,
    'hobby': 'True',
    'degree': 'Master’s degree (MA, MS, M.Eng., MBA, etc.)',
    'major': 'Computer science, computer engineering, or software engineering',
    'desiredJob': ['Software Engineer', 'Data Scientist'],
    'projectExperience': '0-2 years',
    'skills': ['Visual Studio', 'Python', 'C', 'Pycharm', 'Matlab', 'C++', 'SQL', 'Eclipse']
}

# Add the new user as the last row. pd.concat is the public replacement for the
# private DataFrame._append; wrapping the dict in a list makes it a single row
# whose list values stay intact as cell contents.
df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)

In [5]:
# Replace each categorical column by its one-hot encoding, stored as one list
# (the indicator row) per user.
for categorical_column in ('degree', 'major', 'projectExperience'):
    df_onehot = pd.get_dummies(df[categorical_column])
    df[categorical_column] = pd.Series(df_onehot.to_numpy().tolist(), index=df_onehot.index)

In [6]:
# Inspect the frame: degree / major / projectExperience now hold one-hot lists,
# desiredJob / skills hold lists of strings.
df

Unnamed: 0,id,hobby,degree,major,desiredJob,projectExperience,skills
0,569,True,"[False, False, False, True, False, False, False]","[False, False, False, False, False, True, Fals...",[Data scientist or machine learning specialist...,"[False, True, False, False, False, False, Fals...","[C++, Julia, Python, Linux, Atom, IntelliJ, No..."
1,1142,True,"[False, True, False, False, False, False, False]","[False, False, False, False, False, False, Tru...","[Back-end developer, Front-end developer, Full...","[False, False, False, False, False, False, Fal...","[C++, Java, JavaScript, SQL, HTML, CSS, Bash/S..."
2,1886,True,"[False, True, False, False, False, False, False]","[False, False, False, False, False, False, Tru...","[Back-end developer, Desktop or enterprise app...","[False, False, False, False, False, False, Fal...","[C#, JavaScript, HTML, CSS, SQL Server, MySQL,..."
3,2208,False,"[False, False, True, False, False, False, False]","[False, False, False, False, False, False, Fal...","[Back-end developer, Front-end developer, Full...","[False, False, False, False, False, False, Fal...",[]
4,3285,True,"[False, True, False, False, False, False, False]","[False, False, False, False, False, False, Tru...","[Back-end developer, Front-end developer, Full...","[False, False, False, False, False, False, Fal...","[C#, JavaScript, Python, SQL, TypeScript, HTML..."
...,...,...,...,...,...,...,...
276,79707,True,"[False, True, False, False, False, False, False]","[False, False, False, True, False, False, Fals...",[Desktop or enterprise applications developer],"[False, False, False, False, False, False, Fal...","[Groovy, Java, Python, Bash/Shell, Eclipse, Em..."
277,18712,True,"[False, False, True, False, False, False, False]","[False, False, False, False, False, False, Tru...","[Back-end developer, Full-stack developer]","[False, False, False, False, False, False, Fal...","[Java, VB.NET, MongoDB, SQL Server, Oracle, Ec..."
278,79245,True,"[False, False, True, False, False, False, False]","[False, False, False, False, False, False, Fal...","[Back-end developer, C-suite executive (CEO, C...","[False, False, False, False, False, False, Fal...",[]
279,90067,True,"[False, True, False, False, False, False, False]","[False, False, False, False, False, False, Tru...","[Full-stack developer, Mobile developer]","[False, False, False, False, False, False, Fal...",[]


In [7]:
# Function to calculate Jaccard similarity for binary vectors (one-hot encoded)
def jaccard_similarity_binary(vector1, vector2):
    """Return the Jaccard similarity of the active positions of two binary vectors.

    A position is active when its value equals 1 (``True`` counts as 1).

    Args:
        vector1: Array-like of 0/1 or boolean values. NumPy arrays and plain
            Python sequences are both accepted.
        vector2: Array-like of 0/1 or boolean values.

    Returns:
        ``|A & B| / |A | B|`` where A and B are the sets of active indices,
        or 0 when neither vector has an active position.
    """
    # np.asarray makes the comparison element-wise for plain lists as well;
    # without it `some_list == 1` is a single False and the result is always 0.
    active1 = set(np.where(np.asarray(vector1) == 1)[0].tolist())
    active2 = set(np.where(np.asarray(vector2) == 1)[0].tolist())
    union = active1 | active2
    if not union:
        return 0
    return len(active1 & active2) / len(union)

# Function to calculate Jaccard similarity for lists of strings
def jaccard_similarity(list1, list2):
    """Return the Jaccard similarity of two collections, compared as sets.

    Args:
        list1: Iterable of hashable items (duplicates are ignored).
        list2: Iterable of hashable items (duplicates are ignored).

    Returns:
        Size of the intersection divided by the size of the union, or 0 when
        both collections are empty.
    """
    left = set(list1)
    right = set(list2)
    combined = left | right
    if not combined:
        return 0
    shared = left & right
    return len(shared) / len(combined)


# Convert string representations of lists back to actual lists.
# ast.literal_eval only accepts Python literals, so a crafted cell value cannot
# run arbitrary code the way eval() would; non-string cells pass through unchanged.
list_columns = ['degree', 'major', 'desiredJob', 'projectExperience', 'skills']
for col in list_columns:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Separate the new user (last row) from the rest of the data
new_user = df.iloc[-1]
other_users = df[:-1]

# Similarity calculation for each feature group. Every group score lies in [0, 10].

# 1. Similarity for 'hobby': 10 when the value equals the new user's, otherwise 0
hobby_similarity = other_users['hobby'].apply(lambda x: 10 if x == new_user['hobby'] else 0)

# The new user's one-hot vectors do not change inside the loop, so build them once.
new_degree = np.array(new_user['degree'])
new_major = np.array(new_user['major'])
new_experience = np.array(new_user['projectExperience'])

# 2. One-hot encoded features (degree, major, projectExperience):
#    sum of the three Jaccard scores divided by 0.3, i.e. their mean scaled to 0-10.
# 3. Lists of strings (desiredJob, skills):
#    sum of the two Jaccard scores divided by 0.2, i.e. their mean scaled to 0-10.
# Both are collected in a single pass over the other users.
onehot_similarity = []
list_similarity = []
for _, other_user in other_users.iterrows():
    degree_sim = jaccard_similarity_binary(new_degree, np.array(other_user['degree']))
    major_sim = jaccard_similarity_binary(new_major, np.array(other_user['major']))
    project_sim = jaccard_similarity_binary(new_experience, np.array(other_user['projectExperience']))
    onehot_similarity.append((degree_sim + major_sim + project_sim) / 0.3)

    desiredJob_sim = jaccard_similarity(new_user['desiredJob'], other_user['desiredJob'])
    skills_sim = jaccard_similarity(new_user['skills'], other_user['skills'])
    list_similarity.append((desiredJob_sim + skills_sim) / 0.2)

# Combine the group scores. The weights 1/6, 1/2 and 1/3 are proportional to the
# number of features in each group (1 hobby, 3 one-hot, 2 list features), so each
# of the six underlying features carries the same weight.
total_similarities = hobby_similarity / 6 + np.array(onehot_similarity) / 2 + np.array(list_similarity) / 3

# Create a DataFrame to display similarities ('UserIndex' holds the users' id values)
similarities_df = pd.DataFrame({
    'UserIndex': other_users.id,
    'HobbySimilarity': hobby_similarity,
    'OneHotSimilarity': onehot_similarity,
    'ListSimilarity': list_similarity,
    'TotalSimilarity': total_similarities
})

# Apply softmax to the 'TotalSimilarity' column so the scores sum to 1 across users
similarities_df['TotalSimilarity_softmax'] = softmax(similarities_df['TotalSimilarity'])

# Display the results
similarities_df


Unnamed: 0,UserIndex,HobbySimilarity,OneHotSimilarity,ListSimilarity,TotalSimilarity,TotalSimilarity_softmax
0,569,10,0.000000,0.625000,1.875000,0.000469
1,1142,10,3.333333,0.714286,3.571429,0.002558
2,1886,10,3.333333,0.294118,3.431373,0.002223
3,2208,0,3.333333,0.000000,1.666667,0.000381
4,3285,10,3.333333,0.652174,3.550725,0.002505
...,...,...,...,...,...,...
275,41042,10,3.333333,0.882353,3.627451,0.002705
276,79707,10,0.000000,0.666667,1.888889,0.000475
277,18712,10,6.666667,0.666667,5.222222,0.013329
278,79245,10,3.333333,0.000000,3.333333,0.002016


In [8]:
# Score matrix saved next to the trained KGAT model
# (presumably users x items - confirm against the training code).
cf_scores_path = r'E:\OneDrive - National University of Singapore\code\NUS-ISS\Project1\IRS-PM-2024-09-10-GRP8-JobRecommendationSystem\Recommendation_System\KGAT\trained_model\KGAT\ours\pretrain0\cf_scores.npy'
data = np.load(cf_scores_path)

# Replace every -inf entry with 0 so it cannot dominate the weighted sum computed later.
neg_inf_mask = data == -np.inf
data[neg_inf_mask] = 0
print(data)

[[2.5094125  0.5457798  0.86320585 ... 0.64694655 0.5428725  2.9614866 ]
 [1.7571821  0.4797321  0.7343855  ... 0.5962142  0.47354534 2.7825584 ]
 [1.8190633  0.4703961  0.7303033  ... 0.5873413  0.46456838 2.8192024 ]
 ...
 [1.7981231  0.48968816 0.7388602  ... 0.60602975 0.48801172 2.800873  ]
 [2.4156559  0.46187967 0.80723965 ... 0.58620405 0.46363518 2.9865808 ]
 [1.8656905  0.48238537 0.73182285 ... 0.5938782  0.4776994  2.8391533 ]]


In [9]:
# Softmax-normalised similarity of the new user to every existing user, as a NumPy vector
similarity_scores = similarities_df['TotalSimilarity_softmax'].to_numpy()

# Similarity-weighted sum of the rows of the score matrix: one score per matrix column
result = np.dot(similarity_scores, data)

# Convert the result to a list and append four zero scores.
# NOTE(review): the padding is hard-coded; presumably item_list.txt holds four
# more rows than the score matrix has columns - confirm, and confirm that the
# unscored rows really are the last four.
result_list = result.tolist() + [0, 0, 0, 0]

item_list_path = r'E:\OneDrive - National University of Singapore\code\NUS-ISS\Project1\IRS-PM-2024-09-10-GRP8-JobRecommendationSystem\Recommendation_System\KGAT\datasets\ours\item_list.txt'

with open(item_list_path, 'r') as f:
    item_list = f.readlines()

# Use dedicated names here: rebinding `data` (the score matrix) or `df` (the user
# table) would make this cell, and the similarity cell, fail when run again.
item_rows = [line.split() for line in item_list]
items_df = pd.DataFrame(item_rows)
items_df['result_list'] = result_list

# Keep only the third whitespace-separated field as the job id, ranked by score.
final_result = (
    items_df
    .drop(columns=[0, 1])
    .rename(columns={2: 'job_id'})
    .sort_values(by='result_list', ascending=False)
)

final_result

Unnamed: 0,job_id,result_list
1984,1532671992557778,2.877128
1234,2929311776829310,2.873531
752,2352153327231079,2.872782
5630,561762342039753,2.872605
3426,228099982613051,2.871764
...,...,...
7304,2945768634535682,0.362754
7421,3017880610488563,0.000000
7422,2670092777939025,0.000000
7423,158667622654558,0.000000


In [10]:
job_sg_path = r'E:\OneDrive - National University of Singapore\code\NUS-ISS\Project1\IRS-PM-2024-09-10-GRP8-JobRecommendationSystem\Recommendation_System\New_User\job_clean.csv'

# Job attributes kept for the recommendation output, in display order
job_columns = ['Job Id', 'Job Title', 'Job Description', 'Company', 'skills', 'location', 'Work Type', 'Salary Range']

job_sg_df = pd.read_csv(job_sg_path)[job_columns]

In [11]:
# Both join keys are cast to str so that the merge compares like with like
final_result['job_id'] = final_result['job_id'].astype(str)
job_sg_df['Job Id'] = job_sg_df['Job Id'].astype(str)

# Left join keeps every scored job in ranking order and attaches its attributes;
# the right-hand key column duplicates 'job_id' and is dropped afterwards.
merged_result = (
    final_result
    .merge(job_sg_df, left_on='job_id', right_on='Job Id', how='left')
    .drop(columns=['Job Id'])
)

# Display the merged result
merged_result

Unnamed: 0,job_id,result_list,Job Title,Job Description,Company,skills,location,Work Type,Salary Range
0,1532671992557778,2.877128,Procurement Manager,"Analyze procurement data, identify cost-saving...",Hikma Pharmaceuticals,Procurement processes Vendor assessment Contra...,Singapore,Part-Time,$55K-$89K
1,2929311776829310,2.873531,HR Coordinator,Recruitment Coordinators support the hiring pr...,Diageo,Recruitment process Candidate screening Applic...,Singapore,Temporary,$61K-$118K
2,2352153327231079,2.872782,Marketing Analyst,"Analyze digital marketing campaigns, track per...",Fresenius SE & Co. KGaA,Digital marketing strategies Analytics and rep...,Singapore,Intern,$62K-$123K
3,561762342039753,2.872605,Procurement Manager,"Analyze procurement data, identify cost-saving...",Humana,Procurement processes Vendor assessment Contra...,Singapore,Full-Time,$58K-$83K
4,228099982613051,2.871764,Procurement Manager,Promote diversity and inclusion in the supply ...,Tractor Supply Company,Supplier diversity programs Diversity and incl...,Singapore,Intern,$56K-$121K
...,...,...,...,...,...,...,...,...,...
7420,2945768634535682,0.362754,Mechanical Designer,Create and modify technical drawings using com...,GlaxoSmithKline,Computer-aided design (CAD) software Technical...,Singapore,Intern,$60K-$86K
7421,3017880610488563,0.000000,Investment Analyst,A Risk Analyst assesses and manages financial ...,New York Life Insurance,Risk assessment Data analysis Financial modeli...,Singapore,Contract,$65K-$93K
7422,2670092777939025,0.000000,Supply Chain Manager,A Demand Planner analyzes historical sales dat...,ServiceNow,Demand forecasting Inventory management Data a...,Singapore,Part-Time,$56K-$88K
7423,158667622654558,0.000000,Event Planner,Wedding Planners specialize in organizing wedd...,Ceconomy AG (Formerly Metro AG),Wedding planning Venue selection Catering and ...,Singapore,Temporary,$65K-$113K


In [12]:
# merged_result column -> key of the job dictionaries, in output key order
JOB_FIELD_MAP = {
    'job_id': 'jobId',
    'Job Title': 'jobTitle',
    'Job Description': 'jobDescription',
    'Company': 'companyName',
    'skills': 'skills',
    'location': 'location',
    'Work Type': 'employmentType',
    'Salary Range': 'salary',
}

# One dictionary per ranked job, in the same order as merged_result.
# Built in one step instead of an iterrows loop, whose loop variable `result`
# overwrote the score vector of the same name.
Jobs = (
    merged_result[list(JOB_FIELD_MAP)]
    .rename(columns=JOB_FIELD_MAP)
    .to_dict('records')
)

# Preview only the top entries; the full list has one entry per job.
Jobs[:5]

[{'jobId': '1532671992557778',
  'jobTitle': 'Procurement Manager',
  'jobDescription': 'Analyze procurement data, identify cost-saving opportunities, and support the procurement process.',
  'companyName': 'Hikma Pharmaceuticals',
  'skills': 'Procurement processes Vendor assessment Contract negotiation Purchase order management Spend analysis Market research Supplier relationship management Data analysis Problem-solving Attention to detail Communication skills Financial acumen',
  'location': 'Singapore',
  'employmentType': 'Part-Time',
  'salary': '$55K-$89K'},
 {'jobId': '2929311776829310',
  'jobTitle': 'HR Coordinator',
  'jobDescription': 'Recruitment Coordinators support the hiring process by scheduling interviews, screening candidates, and assisting with onboarding. They ensure a smooth recruitment process and maintain candidate records.',
  'companyName': 'Diageo',
  'skills': 'Recruitment process Candidate screening Applicant tracking systems Communication',
  'location': '

In [13]:
# Disabled sketch of a Flask endpoint that would return the Jobs list as JSON.
# from flask import Flask, jsonify

# app = Flask(__name__)

# # Mock recommended-jobs data
# @app.route('/recommend_jobs', methods=['GET'])
# def recommend_jobs():
#     return jsonify(Jobs)

# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=5000)
