In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
### Natural Language Processing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
user_data = pd.read_csv("processed_users.csv")
job_data = pd.read_csv("processed_jobs.csv")

In [3]:
tfidf = TfidfVectorizer(stop_words="english", max_features=412)
tfidf_scores_user = tfidf.fit_transform(user_data["processed_text"])
print(tfidf_scores_user.shape)
tfidf_scores_job = tfidf.fit_transform(job_data["processed_text"])
print(tfidf_scores_job.shape)

(50000, 412)
(22000, 412)


In [4]:
print((np.array(tfidf_scores_user[0])))
print(cosine_similarity((tfidf_scores_job[0]), (tfidf_scores_user[50])))
user_data["embedScore"] = 0
for i in range(tfidf_scores_user.shape[0]):
    user_data["embedScore"][i] = (tfidf_scores_user[i])
job_data["embedScore"] = 0
for i in range(tfidf_scores_job.shape[0]):
    job_data["embedScore"][i] = (tfidf_scores_job[i])

  (0, 270)	0.29896684465454776
  (0, 182)	0.18761247975805795
  (0, 162)	0.2412128772892128
  (0, 84)	0.3656047259658325
  (0, 120)	0.46715632738385837
  (0, 73)	0.12351607935001294
  (0, 361)	0.1053707709406454
  (0, 205)	0.24169395998826512
  (0, 167)	0.32454052216383394
  (0, 28)	0.15518672716324416
  (0, 350)	0.15841787565339954
  (0, 402)	0.15841787565339954
  (0, 14)	0.15841787565339954
  (0, 351)	0.3709596936718242
  (0, 40)	0.19557723425470563
[[0.02226211]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data["embedScore"][i] = (tfidf_scores_user[i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  job_data["embedScore"][i] = (tfidf_scores_job[i])


In [5]:
print(type(np.array(user_data["embedScore"])))

<class 'numpy.ndarray'>


In [6]:
sample_1_user = user_data.sample(n=2000, replace=True, random_state=0).reset_index(drop=True)
sample_1_job = job_data.sample(n=2000, replace=True, random_state=0).reset_index(drop=True)

In [7]:
user_embeddings = np.stack([(embed.toarray())[0] for embed in sample_1_user['embedScore']])
job_embeddings = np.stack([(embed.toarray())[0] for embed in sample_1_job['embedScore']])
print(user_embeddings.shape)
print(user_embeddings)

(2000, 412)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.21563854 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [8]:
'''user_embeddings = np.stack([(embed.toarray())[0] for embed in user_data['embedScore']])
job_embeddings = np.stack([(embed.toarray())[0] for embed in job_data['embedScore']])
print(user_embeddings.shape)
print(user_embeddings)'''

(50000, 412)
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.27807729 0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.13049072 0.        ]]


In [8]:
similarity_matrix = cosine_similarity(user_embeddings, job_embeddings)

In [9]:
print(similarity_matrix)

[[0.02188591 0.04523697 0.01302367 ... 0.06218721 0.05332512 0.05334182]
 [0.         0.         0.03214097 ... 0.         0.         0.        ]
 [0.03600192 0.01555996 0.05480425 ... 0.05045725 0.03368119 0.04096359]
 ...
 [0.         0.03559064 0.0404162  ... 0.06257632 0.02947435 0.09332763]
 [0.         0.02937003 0.0279402  ... 0.06752486 0.04798037 0.        ]
 [0.02712491 0.01718636 0.         ... 0.         0.02717231 0.        ]]


In [10]:
for label in ['top1_id', 'top1_score', 'top2_id', 'top2_score', 'bottom1_id', 'bottom1_score', 'bottom2_id', 'bottom2_score']:
    user_data[label] = None

# Iterate through each user to find top 2 and bottom 2 jobs based on similarity
for i, similarities in enumerate(similarity_matrix):
    # Get indices of jobs sorted by similarity for this user
    sorted_indices = np.argsort(similarities)

    # Top 2 - highest similarity scores
    top_indices = sorted_indices[-2:][::-1]  # Reverse to have top1 before top2
    user_data.at[i, 'top1_id'], user_data.at[i, 'top2_id'] = job_data.iloc[top_indices]['uniq_id'].values
    user_data.at[i, 'top1_score'], user_data.at[i, 'top2_score'] = similarities[top_indices]

    # Bottom 2 - lowest similarity scores
    bottom_indices = sorted_indices[:2]  # Already in ascending order
    user_data.at[i, 'bottom1_id'], user_data.at[i, 'bottom2_id'] = job_data.iloc[bottom_indices]['uniq_id'].values
    user_data.at[i, 'bottom1_score'], user_data.at[i, 'bottom2_score'] = similarities[bottom_indices]


In [11]:
print(user_data.head(10))


   ResponseId     Q120                      MainBranch                 Age  \
0       22607  I agree  I am a developer by profession     35-44 years old   
1        5640  I agree  I am a developer by profession     25-34 years old   
2        1154  I agree  I am a developer by profession     55-64 years old   
3       76622  I agree           I am learning to code  Under 18 years old   
4        8538  I agree  I am a developer by profession     35-44 years old   
5        4560  I agree  I am a developer by profession     35-44 years old   
6       33635  I agree  I am a developer by profession     18-24 years old   
7       58490  I agree  I am a developer by profession     25-34 years old   
8       16570  I agree  I am a developer by profession     35-44 years old   
9       86356  I agree  I am a developer by profession     25-34 years old   

                                          Employment  \
0                                Employed, full-time   
1                            