# Loop 20 times to generate user subsamples

- Run once to get 20 user subsamples of 2000 rows each
- Each user is labelled with the two jobs that have the 2 highest similarity scores
- Each user is labelled with the two jobs that have the 2 lowest similarity scores


In [86]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def convert_str_to_array(embed_str):
    """Parse a stringified embedding vector into a 1-D float array.

    Accepts the bracketed text form of a vector, e.g. ``"[0.1 -0.2 0.3]"``
    (numbers separated by spaces and/or newlines) as well as the
    comma-separated form ``"[0.1, -0.2, 0.3]"``.

    Args:
        embed_str: Text representation of the vector.

    Returns:
        A ``numpy.ndarray`` of dtype float; empty when the vector has no numbers.

    Raises:
        ValueError: If a token between the brackets is not a valid float.
    """
    # Trim outer whitespace before removing the brackets so that a trailing
    # newline or space is not mistaken for the closing bracket.
    inner = embed_str.strip().strip("[]")
    # Treat commas as plain separators; split() with no argument already
    # discards empty tokens and handles embedded newlines.
    tokens = inner.replace(",", " ").split()
    return np.array([float(token) for token in tokens], dtype=float)

# Build 20 labelled user subsamples and write each one to its own CSV.

# The embedding tables are identical for every subsample, so read and parse
# them once here instead of on every iteration of the loop below.
job_df = pd.read_csv("embedded_jobs.csv")
user_df = pd.read_csv("embedded_users.csv")
user_df['user_embeddings'] = user_df['user_embeddings'].apply(convert_str_to_array)
job_df['job_embeddings'] = job_df['job_embeddings'].apply(convert_str_to_array)

# Draw all 20 seeds up front *without replacement*: independent randint(1000)
# draws can repeat a value, which would silently yield two identical samples.
seeds = np.random.choice(1000, size=20, replace=False)

for j in range(20):
    seed = int(seeds[j])

    # The same seed drives both draws. Sampling is with replacement, so a row
    # may occur more than once within a sample.
    # NOTE(review): duplicated job rows mean a user's top-2 (or bottom-2) jobs
    # can be the same uniq_id drawn twice -- confirm this is acceptable.
    sample_1_user = user_df.sample(n=2000, replace=True, random_state=seed).reset_index(drop=True)
    sample_1_job = job_df.sample(n=2000, replace=True, random_state=seed).reset_index(drop=True)

    user_embeddings = np.stack(sample_1_user['user_embeddings'].values)
    job_embeddings = np.stack(sample_1_job['job_embeddings'].values)

    # One row per sampled user, one column per sampled job.
    similarity_matrix = cosine_similarity(user_embeddings, job_embeddings)

    for label in ['top1_id', 'top1_score', 'top2_id', 'top2_score', 'bottom1_id', 'bottom1_score', 'bottom2_id', 'bottom2_score']:
        sample_1_user[label] = None

    # Positional lookup table of job ids, hoisted out of the per-user loop.
    job_ids = sample_1_job['uniq_id'].values

    # Iterate through each user to find top 2 and bottom 2 jobs based on similarity
    for i, similarities in enumerate(similarity_matrix):
        # Job positions ordered from least to most similar for this user
        sorted_indices = np.argsort(similarities)

        # Top 2 - highest similarity scores, reversed so top1 comes before top2
        top_indices = sorted_indices[-2:][::-1]
        sample_1_user.at[i, 'top1_id'], sample_1_user.at[i, 'top2_id'] = job_ids[top_indices]
        sample_1_user.at[i, 'top1_score'], sample_1_user.at[i, 'top2_score'] = similarities[top_indices]

        # Bottom 2 - lowest similarity scores, already in ascending order
        bottom_indices = sorted_indices[:2]
        sample_1_user.at[i, 'bottom1_id'], sample_1_user.at[i, 'bottom2_id'] = job_ids[bottom_indices]
        sample_1_user.at[i, 'bottom1_score'], sample_1_user.at[i, 'bottom2_score'] = similarities[bottom_indices]

    # Sort once, after every user has been labelled. Doing this inside the
    # per-user loop re-sorted the whole frame 2000 times for the same result.
    sorted_user = sample_1_user.sort_values(by="top1_score", ascending=False).reset_index(drop=True)

    # export sorted_user to csv
    # Naming convention: UserWithLabel_{number}_{random_state}.csv
    # NOTE: the "User Samples" directory must already exist; to_csv does not create it.
    sorted_user.to_csv(f"User Samples/UserWithLabel_{j}_{seed}.csv", index=False)

# Step by Step

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the user and job tables; their embedding columns are parsed in a later cell.
user_df = pd.read_csv("embedded_users.csv")
job_df = pd.read_csv("embedded_jobs.csv")

In [None]:
def convert_str_to_array(embed_str):
    """Parse a stringified embedding vector into a 1-D float array.

    Accepts the bracketed text form of a vector, e.g. ``"[0.1 -0.2 0.3]"``
    (numbers separated by spaces and/or newlines) as well as the
    comma-separated form ``"[0.1, -0.2, 0.3]"``.

    Args:
        embed_str: Text representation of the vector.

    Returns:
        A ``numpy.ndarray`` of dtype float; empty when the vector has no numbers.

    Raises:
        ValueError: If a token between the brackets is not a valid float.
    """
    # Trim outer whitespace before removing the brackets so that a trailing
    # newline or space is not mistaken for the closing bracket.
    inner = embed_str.strip().strip("[]")
    # Treat commas as plain separators; split() with no argument already
    # discards empty tokens and handles embedded newlines.
    tokens = inner.replace(",", " ").split()
    return np.array([float(token) for token in tokens], dtype=float)

# Replace each table's stringified embedding column with parsed float arrays.
for frame, column in ((user_df, 'user_embeddings'), (job_df, 'job_embeddings')):
    frame[column] = frame[column].apply(convert_str_to_array)

Change the random state below to draw a different sample.

In [None]:
# Draw 2000 rows (with replacement) from each table using the same fixed seed,
# then renumber the rows 0..1999 so positions and index labels coincide.
sample_1_user = user_df.sample(n=2000, replace=True, random_state=10)
sample_1_user = sample_1_user.reset_index(drop=True)

sample_1_job = job_df.sample(n=2000, replace=True, random_state=10)
sample_1_job = sample_1_job.reset_index(drop=True)

In [None]:
# Stack the per-row embedding arrays into one 2-D matrix per table
# (one row per sampled user / job).
user_vectors = sample_1_user['user_embeddings'].to_numpy()
job_vectors = sample_1_job['job_embeddings'].to_numpy()

user_embeddings = np.stack(user_vectors)
job_embeddings = np.stack(job_vectors)

In [None]:
# NOTE: a notebook echoes only the final expression of a cell, so this shape
# lookup produces no visible output here; wrap it in print() to see it.
user_embeddings.shape
# Show the first 5 rows of user_embeddings
user_embeddings[:5]

array([[ 0.01907146, -0.00829974,  0.01026976, ...,  0.03162427,
         0.00513665,  0.06053568],
       [ 0.06041826,  0.07092373, -0.0728156 , ..., -0.0166954 ,
         0.06578837,  0.05440273],
       [-0.01116717, -0.07107158,  0.02084285, ...,  0.0690934 ,
         0.02083637, -0.02229924],
       [-0.02518902, -0.05464349, -0.06197032, ...,  0.02155872,
         0.03550868,  0.04433848],
       [-0.0093353 ,  0.01102138, -0.02440982, ...,  0.00815124,
        -0.01781286,  0.01758946]])

In [None]:
# Pairwise cosine similarity: one row per sampled user, one column per sampled job.
similarity_matrix = cosine_similarity(user_embeddings, job_embeddings)

In [None]:
# Echo the similarity matrix for a quick visual check.
similarity_matrix

array([[ 0.17937348,  0.15273761,  0.31704955, ...,  0.10795576,
        -0.01074025,  0.11513021],
       [ 0.05742402,  0.07904482,  0.19574735, ...,  0.06015576,
        -0.08150383,  0.01688816],
       [ 0.28481567,  0.21558789,  0.28910754, ...,  0.11903735,
         0.00852393,  0.30544184],
       ...,
       [ 0.00417474, -0.01543332,  0.24896393, ...,  0.05084219,
         0.01065689, -0.06013272],
       [ 0.21396519,  0.29651304,  0.3797948 , ...,  0.2394706 ,
         0.07938975,  0.16682029],
       [ 0.06478971,  0.15015662,  0.17746876, ...,  0.09444445,
         0.09266419,  0.12881098]])

In [None]:
# Create the eight label columns up front, filled with None, so each cell can
# be written individually in the per-user pass below.
for column in ('top1_id', 'top1_score', 'top2_id', 'top2_score',
               'bottom1_id', 'bottom1_score', 'bottom2_id', 'bottom2_score'):
    sample_1_user[column] = None

# Positional lookup table: job_ids[k] is the uniq_id of the k-th sampled job.
job_ids = sample_1_job['uniq_id'].values

# For every user row, record the two most and the two least similar jobs.
for user_pos, user_scores in enumerate(similarity_matrix):
    # Job positions ordered from least to most similar for this user.
    ranking = np.argsort(user_scores)

    # Last two entries are the best matches; reverse them so the best is first.
    best, runner_up = ranking[-2:][::-1]
    sample_1_user.at[user_pos, 'top1_id'] = job_ids[best]
    sample_1_user.at[user_pos, 'top2_id'] = job_ids[runner_up]
    sample_1_user.at[user_pos, 'top1_score'] = user_scores[best]
    sample_1_user.at[user_pos, 'top2_score'] = user_scores[runner_up]

    # First two entries are the worst matches, already in ascending order.
    worst, second_worst = ranking[:2]
    sample_1_user.at[user_pos, 'bottom1_id'] = job_ids[worst]
    sample_1_user.at[user_pos, 'bottom2_id'] = job_ids[second_worst]
    sample_1_user.at[user_pos, 'bottom1_score'] = user_scores[worst]
    sample_1_user.at[user_pos, 'bottom2_score'] = user_scores[second_worst]

In [None]:
# Ten highest top-1 similarity scores across all sampled users, best first.
sorted(sample_1_user["top1_score"], reverse=True)[:10]

[0.6470626271909341,
 0.6381081393682041,
 0.6354698306932636,
 0.6317341422251666,
 0.6313921487798048,
 0.6093074748454339,
 0.6082848621224705,
 0.6080725406255236,
 0.6060513650397545,
 0.6054078349199901]

In [None]:
# Rank users by their best-match score (highest first) and renumber the rows from 0.
sorted_user = sample_1_user.sort_values(by = "top1_score", ascending=False).reset_index(drop = True)

Export sorted_user to csv

In [None]:
# Export sorted_user to CSV without writing the row index.
# Naming convention: UserWithLabel_{number}_{random_state}.csv
# (here sample number 1, drawn with random_state=10).
sorted_user.to_csv("User Samples/UserWithLabel_1_10.csv", index=False)

In [None]:
# Disabled debugging snippet, kept as a bare string literal (the notebook simply
# echoes the string as the cell output). It would print the job and user text
# for the top-2 matches of the ten best-scoring users.
# NOTE(review): if this is re-enabled, `j = 0` is reset on every job row, so the
# `j == 2` early exit can never trigger; initialise j before the inner loop.
'''
for i in range(10):
   top1_score = sorted_user["top1_score"][i]
   top2_score = sorted_user["top2_score"][i]
   top1_id = sorted_user["top1_id"][i]
   top2_id = sorted_user["top2_id"][i]
   text = sorted_user["processed_text"][i]

   for row_num, row in sample_1_job.iterrows():
      j = 0
      if row["uniq_id"] == top1_id:
         job_text1 = row["processed_text"]
         print(f"Top 1 Score: {top1_score}, Job Desc: {job_text1}, User Desc: {text}")
         j += 1
      elif row["uniq_id"] == top2_id:
         job_text2 = row["processed_text"]
         print(f"Top 2 Score: {top2_score}, Job Desc: {job_text2}, User Desc: {text}")
         j += 1
      if j == 2:
         break
'''

'\nfor i in range(10):\n   top1_score = sorted_user["top1_score"][i]\n   top2_score = sorted_user["top2_score"][i]\n   top1_id = sorted_user["top1_id"][i]\n   top2_id = sorted_user["top2_id"][i]\n   text = sorted_user["processed_text"][i]\n\n   for row_num, row in sample_1_job.iterrows():\n      j = 0\n      if row["uniq_id"] == top1_id:\n         job_text1 = row["processed_text"]\n         print(f"Top 1 Score: {top1_score}, Job Desc: {job_text1}, User Desc: {text}")\n         j += 1\n      elif row["uniq_id"] == top2_id:\n         job_text2 = row["processed_text"]\n         print(f"Top 2 Score: {top2_score}, Job Desc: {job_text2}, User Desc: {text}")\n         j += 1\n      if j == 2:\n         break\n'

In [None]:
# value = sample_1_job.loc[sample_1_job['uniq_id'] == '3a087ce73ab876633f799ae6e94e0023', "Job_Features_Merged"].iloc[0]
# print(value)
# print(sample_1_user['Tech_Features_Merged'][0])