In [41]:
#  Link for csv file
#  https://www.kaggle.com/datasets/PromptCloudHQ/us-technology-jobs-on-dicecom

In [42]:
import pandas as pd
import numpy as np

In [43]:
df_jobs = pd.read_csv("./jobs_data.csv")
print("Jobs shape:", df_jobs.shape)
df_jobs.info()

Jobs shape: (21957, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21957 entries, 0 to 21956
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   advertiserurl             21957 non-null  object
 1   company                   21907 non-null  object
 2   employmenttype_jobstatus  21727 non-null  object
 3   jobdescription            21957 non-null  object
 4   jobid                     21957 non-null  object
 5   joblocation_address       21954 non-null  object
 6   jobtitle                  21957 non-null  object
 7   postdate                  21957 non-null  object
 8   shift                     21637 non-null  object
 9   site_name                 3484 non-null   object
 10  skills                    21957 non-null  object
 11  uniq_id                   21957 non-null  object
dtypes: object(12)
memory usage: 2.0+ MB


In [44]:
import re

# Title should contain only characters, spaces and '+' (for C++)
def clean_job_title(title):
    return re.sub(r'[^a-zA-Z\s\+]', ' ', title).lower()

def clean_job_description(text):
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

def clean_skill(skill):
    return skill.replace(',', '').lower()

print(clean_job_title("Sr. BAckend DevlopER"))
print(clean_job_description("I have skills in developing and maintaining software applications using Python, Java, and JavaScript."))
print(clean_skill('HTML, CSS, JavaScript, SQL'))

sr  backend devloper
i have skills in developing and maintaining software applications using python  java  and javascript 
html css javascript sql


In [45]:
# Clean jobtitle, jobdescription and skills
df_jobs["clean_title"] = df_jobs["jobtitle"].apply(clean_job_title)
df_jobs['clean_jobdescription'] = df_jobs['jobdescription'].apply(clean_job_description)
df_jobs['clean_skills'] = df_jobs['skills'].apply(clean_skill)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
# Initialize the TfidfVectorizer
title_vectorizer = CountVectorizer()
description_vectorizer = TfidfVectorizer(stop_words='english')
skills_vectorizer = CountVectorizer(ngram_range=(1, 3))


# fit_transform the vectorizers and create tfidf matrix
title_matrix = title_vectorizer.fit_transform(df_jobs['clean_title'])
description_matrix = description_vectorizer.fit_transform(df_jobs['clean_jobdescription'])
skills_matrix = skills_vectorizer.fit_transform(df_jobs['clean_skills'])

In [48]:

def get_recommendations(title, description, skills):
    # Clean title
    title = clean_job_title(title)
    # Clean description
    if skills:
        description = ' '.join(skills) + ' ' + description
    if title:
        description = f'{title} {description}'
    description = clean_job_description(description)
    # Clean skills
    skills = clean_skill(skills)

    # Compute vectorizer
    query_title_vec = title_vectorizer.transform([title])
    query_description_vec = description_vectorizer.transform([description])
    query_skills_vec = skills_vectorizer.transform([skills])

    # Compute cosine similarity
    cosine_sim_title = cosine_similarity(query_title_vec, title_matrix)
    cosine_sim_description = cosine_similarity(query_description_vec, description_matrix)
    cosine_sim_skills = cosine_similarity(query_skills_vec, skills_matrix)


    # Print the similarity scores for eact feature
    print('sim_title:', cosine_sim_title[0][np.argsort(-cosine_sim_title[0])[:10]])
    print('sim_description:', cosine_sim_description[0][np.argsort(-cosine_sim_description[0])[:10]])
    print('sim_skills:', cosine_sim_skills[0][np.argsort(-cosine_sim_skills[0])[:10]])
    # Print the top skills score and its corresponding skills
    # top_n_skills = np.argsort(-cosine_sim_skills[0])[:10]
    # results_skills = df_jobs.iloc[top_n_skills]
    # results_skills = results_skills.copy()
    # results_skills['score'] = cosine_sim_skills[0][top_n_skills]
    # print(results_skills[['jobtitle', 'skills', 'score']])

    # Combine the cosine similarity scores for job title, job description and skills
    weight_title = 0.4
    weight_description = 0.2
    weight_skills = 0.4
    cosine_sim_input = weight_title * cosine_sim_title + weight_description * cosine_sim_description + weight_skills * cosine_sim_skills

    # Find the indices of the top N jobs with the highest cosine similarity scores
    N = 25
    top_n_indices = np.argsort(-cosine_sim_input[0])[:N]

    # Get the similarity scores of the recommended jobs
    similarity_scores = cosine_sim_input[0][top_n_indices]
    print("Similarity Scores:", similarity_scores)
    
    # Return the top N jobs with the highest cosine similarity scores
    results = df_jobs.iloc[top_n_indices]    
    results = results.copy()
    results['score'] = similarity_scores
    return results

In [49]:
input_title = "backend"
input_description = "I have skills in developing and maintaining software applications using agile techniques, Python, and JavaScript."
input_skills = ['python', 'javascript', 'mysql']
input_skills = ' '.join(input_skills)

recommended_jobs = get_recommendations(input_title, input_description, input_skills)

sim_title: [0.70710678 0.70710678 0.70710678 0.70710678 0.70710678 0.70710678
 0.70710678 0.70710678 0.70710678 0.57735027]
sim_description: [0.2896732  0.2896732  0.25782027 0.24386415 0.22203163 0.21357695
 0.21357695 0.20918275 0.20246527 0.20211751]
sim_skills: [1.         0.5        0.5        0.47140452 0.40824829 0.40824829
 0.40824829 0.40824829 0.40824829 0.40824829]
Similarity Scores: [0.42230056 0.38566438 0.37188067 0.30489556 0.30346146 0.30337414
 0.3009239  0.30033446 0.29915857 0.29915857 0.29765274 0.28489836
 0.28489836 0.28489739 0.28481289 0.28118532 0.27777208 0.26551812
 0.26132766 0.26103204 0.25840562 0.25516337 0.25507893 0.25272862
 0.25155212]


In [50]:
recommended_jobs[['jobtitle', 'jobdescription', 'skills', 'score']].iloc[:10]

Unnamed: 0,jobtitle,jobdescription,skills,score
2984,Full Stack Engineer,Local NYC candidatesWe cannot sponsor visas at...,"Python, JavaScript, MYSQL",0.422301
20530,Backend Engineer,CS Degree (or experience at top company ie: Tw...,Python/Django,0.385664
3219,Backend Engineer,Local CandidatesMust be able to work for any U...,"Python, AWS, MYSQL building scalable systems, Git",0.371881
1699,C# Backend Engineer,Greetings!My name is Manohar and I work with W...,Software Engineer and Window Services and Asyn...,0.304896
10998,Backend Engineer,Minimum QualificationsBachelor's Degree in Com...,Contract W2,0.303461
20977,"Backend Engineer - Python, Cassandra",To Apply: Candidates should have solid experie...,"Python, Cassandra, NoSQL",0.303374
21418,Backend Python Software Engineer,You will:Architect and develop new systems for...,"Python, Django, Javascript, HTTP, Mobile, Java",0.300924
5021,C# Backend Developer,"CSI Interfusion (CSI) (size: 20,001+ employees...","C#, SQL, COSMOS, Power BI",0.300334
21607,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.299159
19872,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.299159
