In [6]:
#  Link for csv file
#  https://www.kaggle.com/datasets/PromptCloudHQ/us-technology-jobs-on-dicecom

In [7]:
import pandas as pd
import numpy as np

In [8]:
df_jobs = pd.read_csv("./jobs_data.csv")
print("Jobs shape:", df_jobs.shape)
df_jobs.info()

Jobs shape: (19674, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19674 entries, 0 to 19673
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        19674 non-null  object
 1   description  19674 non-null  object
 2   skills       19674 non-null  object
 3   company      19626 non-null  object
 4   location     19672 non-null  object
dtypes: object(5)
memory usage: 768.6+ KB


In [9]:
import re

# Title should contain only characters, spaces and '+' (for C++)
def clean_job_title(title):
    return re.sub(r'[^a-zA-Z\s\+]', ' ', title).lower()

def clean_job_description(text):
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

def clean_skill(skill):
    return skill.replace(',', '').lower()

print(clean_job_title("Sr. BAckend DevlopER"))
print(clean_job_description("I have skills in developing and maintaining software applications using Python, Java, and JavaScript."))
print(clean_skill('HTML, CSS, JavaScript, SQL'))

sr  backend devloper
i have skills in developing and maintaining software applications using python  java  and javascript 
html css javascript sql


In [10]:
# Clean jobtitle, jobdescription and skills
df_jobs["clean_title"] = df_jobs['title'].apply(clean_job_title)
df_jobs['clean_jobdescription'] = df_jobs['description'].apply(clean_job_description)
df_jobs['clean_skills'] = df_jobs['skills'].apply(clean_skill)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
# Initialize the TfidfVectorizer
title_vectorizer = CountVectorizer()
description_vectorizer = TfidfVectorizer(stop_words='english')
skills_vectorizer = CountVectorizer(ngram_range=(1, 3))


# fit_transform the vectorizers and create tfidf matrix
title_matrix = title_vectorizer.fit_transform(df_jobs['clean_title'])
description_matrix = description_vectorizer.fit_transform(df_jobs['clean_jobdescription'])
skills_matrix = skills_vectorizer.fit_transform(df_jobs['clean_skills'])

In [13]:

def get_recommendations(title, description, skills):
    # Clean title
    title = clean_job_title(title)
    # Clean description
    if skills:
        description = ' '.join(skills) + ' ' + description
    if title:
        description = f'{title} {description}'
    description = clean_job_description(description)
    # Clean skills
    skills = clean_skill(skills)

    # Compute vectorizer
    query_title_vec = title_vectorizer.transform([title])
    query_description_vec = description_vectorizer.transform([description])
    query_skills_vec = skills_vectorizer.transform([skills])

    # Compute cosine similarity
    cosine_sim_title = cosine_similarity(query_title_vec, title_matrix)
    cosine_sim_description = cosine_similarity(query_description_vec, description_matrix)
    cosine_sim_skills = cosine_similarity(query_skills_vec, skills_matrix)


    # Print the similarity scores for eact feature
    print('sim_title:', cosine_sim_title[0][np.argsort(-cosine_sim_title[0])[:10]])
    print('sim_description:', cosine_sim_description[0][np.argsort(-cosine_sim_description[0])[:10]])
    print('sim_skills:', cosine_sim_skills[0][np.argsort(-cosine_sim_skills[0])[:10]])
    # Print the top skills score and its corresponding skills
    # top_n_skills = np.argsort(-cosine_sim_skills[0])[:10]
    # results_skills = df_jobs.iloc[top_n_skills]
    # results_skills = results_skills.copy()
    # results_skills['score'] = cosine_sim_skills[0][top_n_skills]
    # print(results_skills[['title', 'skills', 'score']])

    # Combine the cosine similarity scores for job title, job description and skills
    weight_title = 0.4
    weight_description = 0.2
    weight_skills = 0.4
    cosine_sim_input = weight_title * cosine_sim_title + weight_description * cosine_sim_description + weight_skills * cosine_sim_skills

    # Find the indices of the top N jobs with the highest cosine similarity scores
    N = 25
    top_n_indices = np.argsort(-cosine_sim_input[0])[:N]

    # Get the similarity scores of the recommended jobs
    similarity_scores = cosine_sim_input[0][top_n_indices]
    print("Similarity Scores:", similarity_scores)
    
    # Return the top N jobs with the highest cosine similarity scores
    results = df_jobs.iloc[top_n_indices]    
    results = results.copy()
    results['score'] = similarity_scores
    return results

In [14]:
input_title = "backend"
input_description = "I have skills in developing and maintaining software applications using agile techniques, Python, and JavaScript."
input_skills = ['python', 'javascript', 'mysql']
input_skills = ' '.join(input_skills)

recommended_jobs = get_recommendations(input_title, input_description, input_skills)

sim_title: [0.70710678 0.70710678 0.70710678 0.70710678 0.70710678 0.70710678
 0.70710678 0.70710678 0.57735027 0.57735027]
sim_description: [0.28956906 0.28956906 0.25747162 0.2436909  0.22220425 0.21303294
 0.21303294 0.20989322 0.20240132 0.20182185]
sim_skills: [1.         0.5        0.5        0.47140452 0.40824829 0.40824829
 0.40824829 0.40824829 0.40824829 0.40824829]
Similarity Scores: [0.42224521 0.38562109 0.37189341 0.30484849 0.30313201 0.30090427
 0.30044453 0.29904976 0.29904976 0.29762369 0.28490387 0.28490387
 0.28482766 0.2848018  0.28117024 0.27776656 0.26549785 0.26133659
 0.26079112 0.25839433 0.25517477 0.2549987  0.25157393 0.25145813
 0.24380478]


In [15]:
recommended_jobs[['title', 'description', 'skills', 'score']].iloc[:10]

Unnamed: 0,title,description,skills,score
2632,Full Stack Engineer,Local NYC candidatesWe cannot sponsor visas at...,"Python, JavaScript, MYSQL",0.422245
18369,Backend Engineer,CS Degree (or experience at top company ie: Tw...,Python/Django,0.385621
2840,Backend Engineer,Local CandidatesMust be able to work for any U...,"Python, AWS, MYSQL building scalable systems, Git",0.371893
1461,C# Backend Engineer,Greetings!My name is Manohar and I work with W...,Software Engineer and Window Services and Asyn...,0.304848
18780,"Backend Engineer - Python, Cassandra",To Apply: Candidates should have solid experie...,"Python, Cassandra, NoSQL",0.303132
19181,Backend Python Software Engineer,You will:Architect and develop new systems for...,"Python, Django, Javascript, HTTP, Mobile, Java",0.300904
4448,C# Backend Developer,"CSI Interfusion (CSI) (size: 20,001+ employees...","C#, SQL, COSMOS, Power BI",0.300445
17758,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.29905
19352,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.29905
19240,Backend Engineer,iCARS is a web and mobile application platform...,"node.js, mongodb, strongloop, loopback",0.297624
