In [6]:
import numpy as np
from collections import Counter

class TfidfVectorizer:
    def __init__(self, stop_words=None):
        self.stop_words = set(stop_words) if stop_words else set()
        self.vocabulary_ = {}
        self.idf_ = []

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)

    def fit(self, documents):
        # Get the number of documents
        N = len(documents)
        # Initialize the vocabulary
        vocabulary = {}
        # Initialize the document frequency dictionary
        df = {}
        # Loop through each document
        for document in documents:
            # Tokenize the document
            tokens = document.split()
            # Remove stop words
            tokens = [token for token in tokens if token not in self.stop_words]
            # Get the word count in the document
            word_counts = Counter(tokens)
            # Update the vocabulary
            for word, count in word_counts.items():
                if word not in vocabulary:
                    vocabulary[word] = len(vocabulary)
                    df[word] = 0
                df[word] += 1
        # Compute the idf values
        self.idf_ = [np.log(N / df[word]) for word in vocabulary]
        self.vocabulary_ = vocabulary
        return self

    def transform(self, documents):
        # Get the number of documents
        N = len(documents)
        # Get the number of words in the vocabulary
        M = len(self.vocabulary_)
        # Initialize the tf-idf matrix
        tfidf_matrix = np.zeros((N, M))
        # Loop through each document
        for i, document in enumerate(documents):
            # Tokenize the document
            tokens = document.split()
            # Remove stop words
            tokens = [token for token in tokens if token not in self.stop_words]
            # Get the word count in the document
            word_counts = Counter(tokens)
            # Loop through each word in the document
            for word, count in word_counts.items():
                j = self.vocabulary_.get(word, -1)
                if j >= 0:
                    tf = count / len(tokens)
                    tfidf_matrix[i, j] = tf * self.idf_[j]
        return tfidf_matrix

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [7]:
import pandas as pd

df_jobs = pd.read_csv("./jobs_data.csv")

In [8]:
import re

# Title should contain only characters, spaces and '+' (for C++)
def clean_job_title(title):
    return re.sub(r'[^a-zA-Z\s\+]', ' ', title).lower()

def clean_job_description(text):
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

def clean_skill(skill):
    return skill.replace(',', '').lower()

print(clean_job_title("Sr. BAckend DevlopER"))
print(clean_job_description("I have skills in developing and maintaining software applications using Python, Java, and JavaScript."))
print(clean_skill('HTML, CSS, JavaScript, SQL'))

sr  backend devloper
i have skills in developing and maintaining software applications using python  java  and javascript 
html css javascript sql


In [9]:
# Clean jobtitle and jobdescription
df_jobs["clean_title"] = df_jobs["title"].apply(clean_job_title)
df_jobs['clean_jobdescription'] = df_jobs['description'].apply(clean_job_description)
df_jobs['clean_skills'] = df_jobs['skills'].apply(clean_skill)
df_jobs = df_jobs.sample(n=2000)

In [10]:
english_stopwords = ''
# open the file in read mode
with open('stopwords.txt', 'r') as file:
    # read the contents of the file into a string variable
    english_stopwords = file.read().split()

In [11]:
# Initialize the TfidfVectorizer
title_vectorizer = TfidfVectorizer()
description_vectorizer = TfidfVectorizer(stop_words=english_stopwords)
skills_vectorizer = TfidfVectorizer()

# Fit and transform the TfidfVectorizer on the documents
title_matrix = title_vectorizer.fit_transform(df_jobs['clean_title'])
description_matrix = description_vectorizer.fit_transform(df_jobs['clean_jobdescription'])
skills_matrix = skills_vectorizer.fit_transform(df_jobs['clean_skills'])

# # Convert the skills column to a list of strings
# # Remove ',' and convert to lowercase
# skills_list = df_jobs['skills'].apply(lambda x: x.replace(',', '').lower()).tolist()
# # Fit the vectorizer on the list of strings
# skills_matrix = skills_vectorizer.fit_transform(skills_list)

In [12]:

def get_recommendations(title, description, skills):
    # Clean title
    title = clean_job_title(title)
    # Clean description
    if skills:
        description = ' '.join(skills) + ' ' + description
    if title:
        description = f'{title} {description}'
    description = clean_job_description(description)
    # Clean skills
    skills = clean_skill(skills)

    # Compute vectorizer
    query_title_vec = title_vectorizer.transform([title]).ravel()
    query_description_vec = description_vectorizer.transform([description]).ravel()
    query_skills_vec = skills_vectorizer.transform([skills]).ravel()

    # Compute cosine similarity
    cosine_sim_description = [cosine_similarity(query_description_vec, description_vec) for description_vec in description_matrix]
    cosine_sim_title = [cosine_similarity(query_title_vec, title_vec) for title_vec in title_matrix]
    cosine_sim_skills = [cosine_similarity(query_skills_vec, skills_vec) for skills_vec in skills_matrix]

    cosine_sim_title = np.array(cosine_sim_title)
    cosine_sim_description = np.array(cosine_sim_description)
    cosine_sim_skills = np.array(cosine_sim_skills)

    # Combine the weighted overall cosine similarity
    weight_title = 0.4
    weight_description = 0.2
    weight_skills = 0.4
    cosine_sim_input = weight_title * cosine_sim_title + weight_description * cosine_sim_description + weight_skills * cosine_sim_skills

    # Print the similarity scores for eact feature
    print('sim_title:', cosine_sim_title[np.argsort(-cosine_sim_title)[:10]])
    print('sim_description:', cosine_sim_description[np.argsort(-cosine_sim_description)[:10]])
    print('sim_skills:', cosine_sim_skills[np.argsort(-cosine_sim_skills)[:10]])
    # top_n_skills = np.argsort(-cosine_sim_skills)[:10]
    # results_skills = df_jobs.iloc[top_n_skills]
    # results_skills = results_skills.copy()
    # results_skills['score'] = cosine_sim_skills[top_n_skills]
    # print(results_skills[['jobtitle', 'skills', 'score']])

    scores = np.array(cosine_sim_input)
    top_n_indices = np.argsort(-scores)[:10]

    results = df_jobs.iloc[top_n_indices]
    results = results.copy()
    results['score'] = scores[top_n_indices]

    return results

In [14]:
input_title = "backend"
input_description = "I have skills in developing and maintaining software applications using agile techniques, Python, and JavaScript."
input_skills = 'Python, Javascript'

recommended_jobs = get_recommendations(input_title, input_description, input_skills)
recommended_jobs[['title', 'description', 'skills', 'score']]

sim_title: [0.88007108 0.86167192 0.81499487 0.69546764 0.67906926 0.53052941
 0.47292568 0.46464313 0.43281338 0.32274205]
sim_description: [0.27259833 0.1937742  0.19062148 0.17773109 0.17091218 0.16672948
 0.15266197 0.14533316 0.14429456 0.14054417]
sim_skills: [0.79026159 0.66955433 0.45361225 0.4480892  0.4360155  0.42996656
 0.41161806 0.40357192 0.40279269 0.39782499]


Unnamed: 0,title,description,skills,score
19181,Backend Python Software Engineer,You will:Architect and develop new systems for...,"Python, Django, Javascript, HTTP, Mobile, Java",0.461447
17758,Senior Backend Engineer,"Our client, an exciting start-up in the food &...","python, backend, django, aws, full stack, node...",0.435521
18048,"Senior Software Engineer, Backend","A San Francisco-based Series A, backed by top ...","Python, C++, AWS, Thrift/Docker",0.421656
2665,Lead Python Backend Engineer,Highly successful/ heavily funded Innovative S...,"Python, Lead, Start up experience, drive platf...",0.380458
18780,"Backend Engineer - Python, Cassandra",To Apply: Candidates should have solid experie...,"Python, Cassandra, NoSQL",0.36628
14937,Java Backend Engineer,"""U.S. Citizens and those authorized to work in...","Backend engineering , in java/rails/JavaScript...",0.355229
3490,Senior Python Engineer,Сurrently we are looking for Sr. Python Softwa...,python javascript web,0.33913
13424,Javascript Developer,We're looking for Javascript Developers with A...,JavaScript,0.278357
4405,Sr. Software Engineer (Graph + Mobile Backend),"I’m working with a profitable, growing company...","Scala, Python, Java, backend, Hadoop, mobile",0.262935
4049,Contract Backend PHP/Wordpress Developer,"Minimum Required Skills:PHP, JavaScript, WordP...","PHP, JavaScript, WordPress, RESTful APIs, HTML...",0.238818


In [17]:
input_title = df_jobs.iloc[0]['title']
input_description  = df_jobs.iloc[0]['description']
input_skills = 'javascript, angular.js, react, node.js'

recommended_jobs = get_recommendations(input_title, input_description, input_skills)
recommended_jobs[['title', 'description', 'skills', 'score']]

sim_title: [1.         0.62807845 0.61799837 0.5998575  0.56941324 0.54114263
 0.5340882  0.5110051  0.49697718 0.49057299]
sim_description: [0.99659122 0.32055064 0.27314154 0.27036825 0.26671271 0.25053429
 0.24955102 0.24955102 0.24955102 0.24792166]
sim_skills: [0.68177416 0.45263611 0.38331787 0.35625577 0.35374502 0.34683016
 0.32496877 0.3232616  0.31926764 0.28329288]


Unnamed: 0,title,description,skills,score
14048,IT Security Manager - BHJOB2052_12784,Must be authorized to work in the U.S./ Sponso...,"IT Security Management, Security Analysis, fir...",0.599318
7646,Senior Front End Developer - JavaScript,"Minimum Required Skills:JavaScript, Node.JS, H...","JavaScript, Node.JS, HTML, CSS, REACT - JavaSc...",0.280341
6108,Project Manager - BHJOB2052_12330,Must be authorized to work in the U.S./ Sponso...,"project manager, agile, sdlc",0.276215
9974,IT Senior Security Analyst,Job Summary.The TMNAS IT Senior Security Analy...,"vulnerability management, security logs, CISSP...",0.26936
18785,Senior IT security,Skill / Job Requirements:Competency Requiremen...,IT Security,0.263933
5259,Java Developer - BHJOB2052_12041,Java Developer (move into a Lead role) Awesome...,"java, j2ee, ui",0.25436
12533,IT Manager,We are seeking an IT Manager for a Seattle are...,"10+ years IT experience, Background in infrast...",0.248237
5971,.NET Developer - BHJOB2052_12539,Must be authorized to work in the U.S./ Sponso...,"c#, asp.net, front end development, javascript...",0.244995
15608,IT Security Delivery Manager/Project Manager,Position :- IT Security Delivery Manager/Proje...,IT Security Delivery Manager/Project Manager,0.230254
11865,Security Engineer,Our client is seeking a Security Engineer with...,"Proxy, Security, Network, encryption, tokeniza...",0.221848
