In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the job and user tables (jobs first, then users); each CSV carries an
# embedding column stored as text, which is parsed in the next cell.
job_df, user_df = (
    pd.read_csv(csv_name)
    for csv_name in ("embedded_jobs.csv", "embedded_users.csv")
)

In [2]:
def convert_str_to_array(embed_str):
    """Parse an embedding that was serialised as text back into a 1-D float array.

    Accepts the whitespace-separated form ``"[0.1 0.2\\n 0.3]"`` as well as the
    comma-separated form ``"[0.1, 0.2, 0.3]"``. Surrounding whitespace is
    ignored and the enclosing brackets are optional.

    Parameters
    ----------
    embed_str : str or array-like
        Text representation of the vector. An ``ndarray``/``list``/``tuple`` is
        passed through (converted to float) so the conversion cell can be
        re-run safely after the column has already been parsed.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``float``; empty when the string holds no numbers.

    Raises
    ------
    TypeError
        If ``embed_str`` is neither a string nor array-like (e.g. a NaN float
        coming from an empty CSV cell).
    ValueError
        If a token cannot be parsed as a float.
    """
    # Already parsed (cell re-run): only normalise the dtype.
    if isinstance(embed_str, (np.ndarray, list, tuple)):
        return np.asarray(embed_str, dtype=float)
    if not isinstance(embed_str, str):
        raise TypeError(
            f"expected a string embedding, got {type(embed_str).__name__}"
        )

    body = embed_str.strip()
    # Drop the brackets only when they are really there; a blind [1:-1] slice
    # would eat digits from un-bracketed input and mis-slice padded strings.
    if body.startswith('['):
        body = body[1:]
    if body.endswith(']'):
        body = body[:-1]

    # Treat commas as separators too; split() with no argument never yields
    # empty tokens, so no extra filtering is required.
    tokens = body.replace(',', ' ').split()
    return np.array([float(token) for token in tokens], dtype=float)

# Replace the text-encoded embedding column of each table with parsed arrays
# (users first, then jobs).
for frame, column in ((user_df, 'user_embeddings'), (job_df, 'job_embeddings')):
    frame[column] = frame[column].apply(convert_str_to_array)

Change `random_state` in the sampling cell below to draw a different sample.

In [3]:
# Draw a seeded sample of 2000 rows from each table and renumber the rows
# 0..1999 so later positional and label lookups coincide.
# NOTE(review): replace=True allows the same row to be drawn more than once,
# so the samples may contain duplicate users/jobs — confirm this is intended.
_sampling_options = dict(n=2000, replace=True, random_state=0)
sample_1_user = user_df.sample(**_sampling_options).reset_index(drop=True)
sample_1_job = job_df.sample(**_sampling_options).reset_index(drop=True)

In [5]:
# Turn each column of per-row vectors into one 2-D matrix (one row per sampled
# user / job) so the similarity can be computed in a single call.
user_embeddings, job_embeddings = (
    np.stack(frame[column].values)
    for frame, column in (
        (sample_1_user, 'user_embeddings'),
        (sample_1_job, 'job_embeddings'),
    )
)

In [6]:
# Pairwise cosine similarity between the two embedding matrices. Per the
# scikit-learn API the result has one row per row of the first argument
# (users) and one column per row of the second (jobs).
similarity_matrix = cosine_similarity(user_embeddings, job_embeddings)

In [7]:
# Display the matrix; NumPy abbreviates large arrays in the repr, so only the
# corner values appear in the output.
similarity_matrix

array([[ 0.06409579,  0.08736673,  0.08579661, ...,  0.06047326,
         0.43333627,  0.1071565 ],
       [ 0.06548012,  0.0382575 , -0.02327894, ...,  0.07977806,
         0.06991166,  0.02290556],
       [ 0.05159002,  0.10440023, -0.02228508, ...,  0.05652013,
         0.30451759,  0.08523632],
       ...,
       [ 0.13146621,  0.08600752,  0.01596827, ...,  0.05325247,
         0.18031842,  0.08367417],
       [ 0.10253367,  0.11754141,  0.05703012, ...,  0.12177801,
         0.16531908,  0.11984537],
       [ 0.16057843,  0.08132413,  0.05407259, ...,  0.1230199 ,
         0.21142906,  0.15153718]])

In [8]:
# For every sampled user, record the two most similar and the two least similar
# jobs (id + cosine score) as new columns on sample_1_user.
#
# The ranking is done for all users at once: a single row-wise argsort replaces
# the per-user Python loop, and whole columns are assigned in one go so the
# score columns get a numeric dtype instead of `object`.
if similarity_matrix.shape[1] < 2:
    raise ValueError("need at least two jobs to pick top-2 / bottom-2 matches")

# Ascending order within each row: column 0 is the least similar job,
# column -1 the most similar one.
ranked_job_positions = np.argsort(similarity_matrix, axis=1)
job_ids = sample_1_job['uniq_id'].values
user_positions = np.arange(similarity_matrix.shape[0])

# Output-column prefix -> column of the ascending ranking to read from.
# Insertion order fixes the order in which the new columns are created.
rank_columns = {'top1': -1, 'top2': -2, 'bottom1': 0, 'bottom2': 1}

for prefix, rank_column in rank_columns.items():
    job_positions = ranked_job_positions[:, rank_column]
    sample_1_user[f'{prefix}_id'] = job_ids[job_positions]
    # Pair each user row with its selected job column to pull out the score.
    sample_1_user[f'{prefix}_score'] = similarity_matrix[user_positions, job_positions]

In [22]:
# Ten highest best-match scores across all sampled users, largest first.
top1_scores_descending = sorted(sample_1_user["top1_score"], reverse=True)
top1_scores_descending[:10]

[0.6386429332292312,
 0.6350196097229882,
 0.6310751242290228,
 0.6288761952946105,
 0.6220248395075859,
 0.6218228210094235,
 0.6201664370383289,
 0.6180844462593503,
 0.6151718518331782,
 0.612866585291409]

In [46]:
# Users ordered from strongest to weakest best match, renumbered from 0 so that
# position i is the i-th best-matched user.
sorted_user = (
    sample_1_user
    .sort_values(by="top1_score", ascending=False)
    .reset_index(drop=True)
)

In [52]:
# NOTE(review): this cell is disabled — the loop below is wrapped in a string
# literal, so nothing in it executes; the cell only evaluates to that string.
# The quoted code walks the ten best-matched users in sorted_user and scans
# sample_1_job for their top1/top2 job ids to print score, job text and user text.
# Known issue to fix before re-enabling: `j = 0` sits inside the inner loop, so
# it is reset on every job row; `j` never reaches 2 and the `break` never fires.
'''
for i in range(10):
   top1_score = sorted_user["top1_score"][i]
   top2_score = sorted_user["top2_score"][i]
   top1_id = sorted_user["top1_id"][i]
   top2_id = sorted_user["top2_id"][i]
   text = sorted_user["processed_text"][i]

   for row_num, row in sample_1_job.iterrows():
      j = 0
      if row["uniq_id"] == top1_id:
         job_text1 = row["processed_text"]
         print(f"Top 1 Score: {top1_score}, Job Desc: {job_text1}, User Desc: {text}")
         j += 1
      elif row["uniq_id"] == top2_id:
         job_text2 = row["processed_text"]
         print(f"Top 2 Score: {top2_score}, Job Desc: {job_text2}, User Desc: {text}")
         j += 1
      if j == 2:
         break
'''

Top 2 Score: 0.5650869454879932, Job Desc: python-engineer job title : python engineerlocation : midtown nycjob type : contractduration : long termcontact info : matt o'brien - mobrien @ techlink.com - 201-786-2415python engineer cloud aws puppet exp.key technology : • nginx haproxy apache wsgi• git gitflow• python web framework flask django• unit testing using framework unittest• understanding test automation tool selenium• apm tool newrelicinfrastructure : • aws : least 3+ years• docker : 1+ years• thorough understanding tcp ip http• understanding network security various encryption standards• able configure tune web servers• thorough understanding linux reasonable knowledge linux kernel• asic understanding puppet scm tool development language : • python : least 5 year experience• go : 1+ years• javasscript framework bootstrap angularjs• html5 templating frameworks• basic understanding bash scripting• must understanding modular application oop• boto sdk cloud tool terraform• json yam

In [11]:
# Spot check: print the merged job features of one hand-picked job id, followed
# by the merged tech features of the first sampled user.
# The .iloc[0] takes the first matching row; it raises if the id is not part of
# the current job sample.
is_target_job = sample_1_job['uniq_id'] == '3a087ce73ab876633f799ae6e94e0023'
value = sample_1_job.loc[is_target_job, "Job_Features_Merged"].iloc[0]
print(value)

first_user_features = sample_1_user['Tech_Features_Merged'][0]
print(first_user_features)

Python Developer, AWS If you’re looking for a career that transforms, inspires, challenges, and rewards you, then come join us! Verisk Analytics is a global supplier of risk assessment services and decision analytics for customers in a variety of markets, including insurance, healthcare, financial services, supply chain, and others. We’re a thriving public company with solid revenue growth and earnings and offices worldwide. And we’re continually looking for ways to augment our existing markets and expand into new markets with excellent growth potential. At Verisk, you’ll be part of an organization that’s committed to serving the long-term interests of our stakeholders, including the communities where we operate.Requirements:• Conceptual knowledge of broad cloud computing architectures in general, and AWS in particular. Must understand cloud networking and the shared security model.• Two years of hands on experience with various AWS services such as VPC, Security Groups, Network ACL’s,