# Job Recommendation System

In [1]:
#  Link for csv file
#  https://www.kaggle.com/datasets/PromptCloudHQ/us-technology-jobs-on-dicecom

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the job postings CSV into df_jobs, then print its shape and a
# per-column summary (non-null counts and dtypes).
df_jobs = pd.read_csv("./jobs_data.csv")
print("Jobs shape:", df_jobs.shape)
df_jobs.info()

Jobs shape: (19674, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19674 entries, 0 to 19673
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        19674 non-null  object
 1   description  19674 non-null  object
 2   skills       19674 non-null  object
 3   company      19626 non-null  object
 4   location     19672 non-null  object
dtypes: object(5)
memory usage: 768.6+ KB


In [4]:
import re

# Title should contain only characters, spaces and '+' (for C++)
def clean_job_title(title):
    """Normalize a job title for vectorization.

    Every character other than an ASCII letter or '+' (kept so that "C++"
    survives) is replaced by a space, runs of whitespace are collapsed, and
    the result is stripped and lower-cased.

    Args:
        title: Raw job title string.

    Returns:
        The cleaned, lower-case title.
    """
    # Raw pattern with a bare '+': inside a character class '+' is already
    # literal, and the former non-raw '\+' was an invalid string escape
    # (SyntaxWarning on Python 3.12+).
    text = re.sub(r'[^a-zA-Z+]', ' ', title)
    # Collapse whitespace runs and trim both ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

def clean_job_description(text):
    """Reduce free text to lower-case ASCII words separated by single spaces.

    Anything that is not an ASCII letter (digits, punctuation, symbols,
    line breaks) acts as a word separator.
    """
    # After the substitution only letters and plain spaces remain, so a
    # split/join both collapses repeated spaces and trims the ends.
    letters_and_spaces = re.sub('[^a-zA-Z]', ' ', text)
    words = letters_and_spaces.split()
    return ' '.join(words).lower()

def clean_skill(skill):
    """Turn a comma-separated skills string into lower-case, space-separated text.

    Each comma (with any whitespace around it) becomes a single space, so
    skills written without a space after the comma ("python,java") stay
    separate words instead of being fused into one token.

    Args:
        skill: Raw skills string, e.g. 'HTML, CSS, JavaScript, SQL'.

    Returns:
        The lower-cased string with commas replaced by single spaces and
        leading/trailing whitespace removed.
    """
    return re.sub(r'\s*,\s*', ' ', skill).strip().lower()

# Quick sanity check: run each of the three cleaners on a sample input
print(clean_job_title("Sr. BAckend DevlopER"))
print(clean_job_description("I have skills in developing applications using Python3, React.js, and JavaScript.      I can use Java, express.JS!"))
print(clean_skill('HTML, CSS, JavaScript, SQL'))

sr backend devloper
i have skills in developing applications using python react js and javascript i can use java express js
html css javascript sql


In [5]:
# Derive the cleaned text columns used by the vectorizers:
# title -> clean_title, description -> clean_jobdescription, skills -> clean_skills
df_jobs["clean_title"] = df_jobs["title"].map(clean_job_title)
df_jobs["clean_jobdescription"] = df_jobs["description"].map(clean_job_description)
df_jobs["clean_skills"] = df_jobs["skills"].map(clean_skill)

## TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Initialize one vectorizer per field: raw term counts for titles and skills,
# TF-IDF for descriptions.
# The title token pattern extends scikit-learn's default (tokens of two or
# more word characters) with a second alternative, so that a single word
# character followed by '+' signs -- e.g. "c++", which clean_job_title
# deliberately preserves -- is kept as a token instead of being discarded.
title_vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w\w+\b|\b\w\++")
description_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.01)
skills_vectorizer = CountVectorizer(ngram_range=(1, 3))


# Fit each vectorizer on its cleaned column and build the document-term matrices
title_matrix = title_vectorizer.fit_transform(df_jobs['clean_title'])
description_matrix = description_vectorizer.fit_transform(df_jobs['clean_jobdescription'])
skills_matrix = skills_vectorizer.fit_transform(df_jobs['clean_skills'])

## BERT

In [8]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained sentence transformer model.
# MODEL_NAME is also handed to get_sentence_transformer_model_score, where it
# is only used in the printed log line.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [9]:
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import pickle

def get_sentence_transformer_model_score(input_description, model_name, model):
    """Score every job description against the input text using sentence embeddings.

    Job-description embeddings are cached in ``job_embeddings.pkl``. The cache
    is reused only when its shape equals ``(len(df_jobs), embedding_dim)``;
    otherwise (missing, unreadable, or wrong shape) the embeddings are
    recomputed from ``df_jobs['clean_jobdescription']`` and the file is rewritten.
    Note that only the shape is validated, so a cache built from different
    data or a different model of the same dimension would still be accepted.

    Args:
        input_description: Text to compare against the job descriptions.
        model_name: Model name; used only in the printed log line.
        model: Object providing ``encode`` and ``get_sentence_embedding_dimension``.

    Returns:
        1-D array with one cosine similarity per row of ``df_jobs``; negative
        similarities are replaced by 0.
    """
    embedding_dim = model.get_sentence_embedding_dimension()
    print(f'\n Model={model_name} \n Embedding-Dimensions={embedding_dim}')

    input_embeddings = model.encode(input_description)

    PICKLE_FILE_PATH = 'job_embeddings.pkl'
    expected_shape = (len(df_jobs), embedding_dim)

    # Try to reuse cached job embeddings; fall back to re-encoding on any
    # load failure or shape mismatch.
    job_embeddings = None
    try:
        # NOTE: pickle.load can execute arbitrary code -- only load cache
        # files that were written by this notebook.
        with open(PICKLE_FILE_PATH, 'rb') as file:
            cached_embeddings = pickle.load(file)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        # Missing, truncated or corrupt cache file
        print("Failed to load job_embeddings. Re-encoding.")
    else:
        # getattr guards against a pickle that holds something other than an array
        if getattr(cached_embeddings, 'shape', None) == expected_shape:
            job_embeddings = cached_embeddings
            print("Loaded job_embeddings from pickle file.")
        else:
            print("Dimensions of loaded job_embeddings do not match. Re-encoding.")

    if job_embeddings is None:
        job_embeddings = np.zeros(expected_shape)
        # Iterate over the column values directly rather than building a row
        # Series with df_jobs.iloc[i] for every job.
        descriptions = df_jobs['clean_jobdescription']
        for i, text in enumerate(tqdm(descriptions, desc='Encoding job description texts', unit='jobs')):
            job_embeddings[i] = model.encode(text)

        # Persist the embeddings to the same path the loader reads from
        with open(PICKLE_FILE_PATH, 'wb') as file:
            pickle.dump(job_embeddings, file)

    # Cosine similarity between each job embedding and the input embedding
    cosine_similarities = cosine_similarity(job_embeddings, [input_embeddings])

    # Set negative scores to 0
    cosine_similarities[cosine_similarities < 0] = 0

    return cosine_similarities.flatten()

In [10]:
def clean_text(title, description, skills):
    """Clean the three query fields and fold title and skills into the description.

    The description that is returned is built as
    ``"<clean title> <raw skills> <raw description>"`` (empty parts omitted)
    and then passed through ``clean_job_description``.

    Args:
        title: Query job title; ``None`` or empty is allowed.
        description: Free-text query description; ``None`` or empty is allowed.
        skills: Comma-separated skills string; ``None`` or empty is allowed.

    Returns:
        Tuple ``(title, description, skills)`` of cleaned strings.
    """
    # Treat missing inputs as empty strings: the cleaners call re.sub / str
    # methods and would raise on None, although the truthiness checks below
    # already treat these fields as optional.
    title = clean_job_title(title or '')
    description = description or ''
    skills = skills or ''

    # Enrich the description with the skills and the (cleaned) title
    if skills:
        description = skills + ' ' + description
    if title:
        description = f'{title} {description}'
    description = clean_job_description(description)

    skills = clean_skill(skills)
    return title, description, skills


def compute_vectorizer_similarity(query, vectorizer, matrix):
    """Return the cosine similarity of one query string to every row of ``matrix``.

    The query is wrapped in a one-element list and transformed with the
    already-fitted ``vectorizer`` before being compared against ``matrix``.
    """
    return cosine_similarity(vectorizer.transform([query]), matrix)


def compute_weighted_similarity_score(title_scores, description_scores, skills_scores, bert_scores,
                                      weight_title=0.4, weight_description=0.2, weight_skills=0.4):
    """Blend the per-field similarity scores into one weighted score.

    The description component is the mean of the vectorizer-based
    ``description_scores`` and the embedding-based ``bert_scores``. The
    inputs are combined with ordinary NumPy broadcasting, so a 1-D
    ``bert_scores`` of length n combines with ``(1, n)`` score arrays into
    a ``(1, n)`` result.

    Args:
        title_scores: Similarity scores for the title field.
        description_scores: Similarity scores for the description field.
        skills_scores: Similarity scores for the skills field.
        bert_scores: Sentence-embedding similarity scores.
        weight_title: Weight of the title scores (default 0.4).
        weight_description: Weight of the averaged description scores (default 0.2).
        weight_skills: Weight of the skills scores (default 0.4).

    Returns:
        The weighted combination of the inputs.
    """
    blended_description = 0.5 * (description_scores + bert_scores)
    return (
        weight_title * title_scores
        + weight_description * blended_description
        + weight_skills * skills_scores
    )



def get_recommendations(title, description, skills, top_n=25):
    """Rank the jobs in ``df_jobs`` against a query and return the best matches.

    Side effect: overwrites the columns ``score``, ``skills_score``,
    ``title_score``, ``description_score`` and ``bert_score`` on the global
    ``df_jobs`` with the scores for this query.

    Args:
        title: Query job title.
        description: Free-text query description.
        skills: Comma-separated query skills.
        top_n: Number of highest-scoring jobs to return (default 25).

    Returns:
        A copy of the ``top_n`` rows of ``df_jobs`` with the highest combined
        score, in descending score order.
    """
    title, description, skills = clean_text(title, description, skills)

    # Vectorizer-based cosine similarities for title, description and skills
    cosine_sim_title = compute_vectorizer_similarity(title, title_vectorizer, title_matrix)
    cosine_sim_description = compute_vectorizer_similarity(description, description_vectorizer, description_matrix)
    cosine_sim_skills = compute_vectorizer_similarity(skills, skills_vectorizer, skills_matrix)

    # Sentence-transformer similarity of the description
    bert_scores = get_sentence_transformer_model_score(description, MODEL_NAME, model)

    # Combine the individual scores into one weighted score
    cosine_sim_input = compute_weighted_similarity_score(cosine_sim_title, cosine_sim_description, cosine_sim_skills, bert_scores)
    combined_scores = cosine_sim_input[0]

    # Store the per-job scores on df_jobs
    df_jobs['score'] = combined_scores
    df_jobs['skills_score'] = cosine_sim_skills[0]
    df_jobs['title_score'] = cosine_sim_title[0]
    df_jobs['description_score'] = cosine_sim_description[0]
    df_jobs['bert_score'] = bert_scores

    # Indices of the top_n highest scores; a stable sort keeps tied jobs
    # (e.g. duplicate postings) in their original row order on every run.
    top_n_indices = np.argsort(-combined_scores, kind='stable')[:top_n]

    # Return an explicit copy so callers can modify the result without
    # affecting df_jobs or triggering chained-assignment warnings.
    return df_jobs.iloc[top_n_indices].copy()

In [11]:
# Example query: a title keyword, a free-text self description and a
# comma-separated skills list.
input_title = "backend"
input_description = "I have skills in developing and maintaining software applications using agile techniques, Python, and JavaScript."
input_skills = 'python, javascript, mysql'

# Also writes the per-job score columns onto df_jobs as a side effect
recommended_jobs = get_recommendations(input_title, input_description, input_skills)


 Model=all-MiniLM-L6-v2 
 Embedding-Dimensions=384
Loaded job_embeddings from pickle file.


In [12]:
# Show the 10 best matches with the combined score and each component score
recommended_jobs[['title', 'description', 'skills', 'score', 'description_score', 'bert_score', 'title_score', 'skills_score']].head(10)

Unnamed: 0,title,description,skills,score,description_score,bert_score,title_score,skills_score
2632,Full Stack Engineer,Local NYC candidatesWe cannot sponsor visas at...,"Python, JavaScript, MYSQL",0.481458,0.279778,0.534807,0.0,1.0
18369,Backend Engineer,CS Degree (or experience at top company ie: Tw...,Python/Django,0.417124,0.094235,0.305769,0.707107,0.235702
2840,Backend Engineer,Local CandidatesMust be able to work for any U...,"Python, AWS, MYSQL building scalable systems, Git",0.414185,0.149158,0.394462,0.707107,0.19245
19181,Backend Python Software Engineer,You will:Architect and develop new systems for...,"Python, Django, Javascript, HTTP, Mobile, Java",0.34403,0.12726,0.469761,0.5,0.210819
18780,"Backend Engineer - Python, Cassandra",To Apply: Candidates should have solid experie...,"Python, Cassandra, NoSQL",0.335985,0.217734,0.475446,0.5,0.166667
17758,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.331962,0.251418,0.503766,0.57735,0.063758
19352,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.331962,0.251418,0.503766,0.57735,0.063758
4448,C# Backend Developer,"CSI Interfusion (CSI) (size: 20,001+ employees...","C#, SQL, COSMOS, Power BI",0.325081,0.10585,0.316535,0.707107,0.0
19240,Backend Engineer,iCARS is a web and mobile application platform...,"node.js, mongodb, strongloop, loopback",0.323438,0.118259,0.287692,0.707107,0.0
14937,Java Backend Engineer,"""U.S. Citizens and those authorized to work in...","Backend engineering , in java/rails/JavaScript...",0.320282,0.10641,0.453674,0.57735,0.083333


## TF-IDF and BERT scores

In [13]:
# Rank all jobs by the vectorizer-based description score and show it next to
# the BERT and combined scores. The score columns exist only after
# get_recommendations has been called, and reflect its most recent query.
df_jobs[['clean_jobdescription', 'description_score', 'bert_score', 'score']].sort_values(by='description_score', ascending=False).head(20)

Unnamed: 0,clean_jobdescription,description_score,bert_score,score
5317,this is a fulltime direct hire primarily fulls...,0.368572,0.606887,0.260845
5303,one of our clients is looking for a fullstack ...,0.350018,0.564065,0.168388
18970,looking for python programmer who will be doin...,0.342277,0.429133,0.124282
3184,description investment management web develope...,0.3398,0.493906,0.177652
9276,description investment management web develope...,0.3398,0.493906,0.177652
5175,python developer location houston txduration f...,0.337616,0.567377,0.144192
340,minimum required skills web application develo...,0.328859,0.608327,0.181201
18131,greetings from accion labs accion labs is a le...,0.327721,0.555326,0.159574
2665,highly successful heavily funded innovative st...,0.323695,0.541033,0.319806
12404,python developer someone who has python experi...,0.320762,0.457922,0.116358


In [14]:
import textwrap

# Print the raw description of the job at positional row 8263, wrapped to
# lines of at most 100 characters for readability.
wrapped_content = textwrap.fill(df_jobs.iloc[8263]['description'], 100)
print(wrapped_content)

Minimum Required Skills:Python, MySQL, API, JavaScript, DjangoWe are a well-established, Award
Winning Software Company. Where we may be passed out Start-Up years, we don't act like it! Great
Culture, work from home flexibility, beer Friday, etc. We hire great people to build great
platforms! Come Join Us!What You Need for this Position- 3+ years of of Python experience- Previous
API build experience - Experience working with Flask or Django - Previous experience building and
consuming RESTful web applications and services- Experience with SQL Databases like MySQL and
PostgreSQLWhat You Will Be Doing- Collaborate on a cross-discipline team to build scalable web
applications and services in a Agile fashion- Keep current on emerging technologies to advance our
architecture/technologies to support growth of the business- Benchmark and optimize the performance
of new and existing applications- Participate in departmental code reviews and best practice
discussionsSo, if you are an Django De

In [15]:
# Display the query description that was passed to get_recommendations
input_description

'I have skills in developing and maintaining software applications using agile techniques, Python, and JavaScript.'