In [1]:
import numpy as np
from collections import Counter

class TfidfVectorizer:
    def __init__(self, stop_words=None):
        self.stop_words = set(stop_words) if stop_words else set()
        self.vocabulary_ = {}
        self.idf_ = []

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)

    def fit(self, documents):
        # Get the number of documents
        N = len(documents)
        # Initialize the vocabulary
        vocabulary = {}
        # Initialize the document frequency dictionary
        df = {}
        # Loop through each document
        for document in documents:
            # Tokenize the document
            tokens = document.split()
            # Remove stop words
            tokens = [token for token in tokens if token not in self.stop_words]
            # Get the word count in the document
            word_counts = Counter(tokens)
            # Update the vocabulary
            for word, count in word_counts.items():
                if word not in vocabulary:
                    vocabulary[word] = len(vocabulary)
                    df[word] = 0
                df[word] += 1
        # Compute the idf values
        self.idf_ = [np.log(N / df[word]) for word in vocabulary]
        self.vocabulary_ = vocabulary
        return self

    def transform(self, documents):
        # Get the number of documents
        N = len(documents)
        # Get the number of words in the vocabulary
        M = len(self.vocabulary_)
        # Initialize the tf-idf matrix
        tfidf_matrix = np.zeros((N, M))
        # Loop through each document
        for i, document in enumerate(documents):
            # Tokenize the document
            tokens = document.split()
            # Remove stop words
            tokens = [token for token in tokens if token not in self.stop_words]
            # Get the word count in the document
            word_counts = Counter(tokens)
            # Loop through each word in the document
            for word, count in word_counts.items():
                j = self.vocabulary_.get(word, -1)
                if j >= 0:
                    tf = count / len(tokens)
                    tfidf_matrix[i, j] = tf * self.idf_[j]
        return tfidf_matrix

def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [2]:
import pandas as pd

df_jobs = pd.read_csv("./jobs_data.csv")

In [3]:
import re

# Title should contain only characters, spaces and '+' (for C++)
def clean_job_title(title):
    return re.sub(r'[^a-zA-Z\s\+]', ' ', title).lower()

def clean_job_description(text):
    # Remove punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text

def clean_skill(skill):
    return skill.replace(',', '').lower()

print(clean_job_title("Sr. BAckend DevlopER"))
print(clean_job_description("I have skills in developing and maintaining software applications using Python, Java, and JavaScript."))
print(clean_skill('HTML, CSS, JavaScript, SQL'))

sr  backend devloper
i have skills in developing and maintaining software applications using python  java  and javascript 
html css javascript sql


In [4]:
# Clean jobtitle and jobdescription
df_jobs["clean_title"] = df_jobs["title"].apply(clean_job_title)
df_jobs['clean_jobdescription'] = df_jobs['description'].apply(clean_job_description)
df_jobs['clean_skills'] = df_jobs['skills'].apply(clean_skill)
df_jobs = df_jobs.sample(n=2000)

In [5]:
english_stopwords = ''
# open the file in read mode
with open('stopwords.txt', 'r') as file:
    # read the contents of the file into a string variable
    english_stopwords = file.read().split()

In [6]:
# Initialize the TfidfVectorizer
title_vectorizer = TfidfVectorizer()
description_vectorizer = TfidfVectorizer(stop_words=english_stopwords)
skills_vectorizer = TfidfVectorizer()

# Fit and transform the TfidfVectorizer on the documents
title_matrix = title_vectorizer.fit_transform(df_jobs['clean_title'])
description_matrix = description_vectorizer.fit_transform(df_jobs['clean_jobdescription'])
skills_matrix = skills_vectorizer.fit_transform(df_jobs['clean_skills'])

# # Convert the skills column to a list of strings
# # Remove ',' and convert to lowercase
# skills_list = df_jobs['skills'].apply(lambda x: x.replace(',', '').lower()).tolist()
# # Fit the vectorizer on the list of strings
# skills_matrix = skills_vectorizer.fit_transform(skills_list)

In [7]:

def get_recommendations(title, description, skills):
    # Clean title
    title = clean_job_title(title)
    # Clean description
    if skills:
        description = ' '.join(skills) + ' ' + description
    if title:
        description = f'{title} {description}'
    description = clean_job_description(description)
    # Clean skills
    skills = clean_skill(skills)

    # Compute vectorizer
    query_title_vec = title_vectorizer.transform([title]).ravel()
    query_description_vec = description_vectorizer.transform([description]).ravel()
    query_skills_vec = skills_vectorizer.transform([skills]).ravel()

    # Compute cosine similarity
    cosine_sim_description = [cosine_similarity(query_description_vec, description_vec) for description_vec in description_matrix]
    cosine_sim_title = [cosine_similarity(query_title_vec, title_vec) for title_vec in title_matrix]
    cosine_sim_skills = [cosine_similarity(query_skills_vec, skills_vec) for skills_vec in skills_matrix]

    cosine_sim_title = np.array(cosine_sim_title)
    cosine_sim_description = np.array(cosine_sim_description)
    cosine_sim_skills = np.array(cosine_sim_skills)

    # Combine the weighted overall cosine similarity
    weight_title = 0.4
    weight_description = 0.2
    weight_skills = 0.4
    cosine_sim_input = weight_title * cosine_sim_title + weight_description * cosine_sim_description + weight_skills * cosine_sim_skills

    # Print the similarity scores for eact feature
    print('sim_title:', cosine_sim_title[np.argsort(-cosine_sim_title)[:10]])
    print('sim_description:', cosine_sim_description[np.argsort(-cosine_sim_description)[:10]])
    print('sim_skills:', cosine_sim_skills[np.argsort(-cosine_sim_skills)[:10]])
    # top_n_skills = np.argsort(-cosine_sim_skills)[:10]
    # results_skills = df_jobs.iloc[top_n_skills]
    # results_skills = results_skills.copy()
    # results_skills['score'] = cosine_sim_skills[top_n_skills]
    # print(results_skills[['jobtitle', 'skills', 'score']])

    scores = np.array(cosine_sim_input)
    top_n_indices = np.argsort(-scores)[:10]

    results = df_jobs.iloc[top_n_indices]
    results = results.copy()
    results['score'] = scores[top_n_indices]

    return results

In [8]:
input_title = "backend"
input_description = "I have skills in developing and maintaining software applications using agile techniques, Python, and JavaScript."
input_skills = 'Python, Javascript'

recommended_jobs = get_recommendations(input_title, input_description, input_skills)
recommended_jobs[['title', 'description', 'skills', 'score']]

sim_title: [0.84066203 0.83155405 0.62239272 0.58116281 0.52240732 0.
 0.         0.         0.         0.        ]
sim_description: [0.1512116  0.15096969 0.14619552 0.14257341 0.13547382 0.1339545
 0.13182309 0.12720989 0.12254433 0.11652909]
sim_skills: [0.64461179 0.58429387 0.43816618 0.43170006 0.41032012 0.39733528
 0.37278548 0.36727089 0.34705111 0.34297658]


Unnamed: 0,title,description,skills,score
18201,Sr. Software Engineer (Backend),We have Job Opportunity for Sr Software Engine...,"JAVA, PYTHON, NoSQL, JSON, REST API, server side",0.431903
14547,Senior Software Engineer - Backend,"Senior Software Engineer BackendSeattle, WADow...","SaaS, Java, Hadoop, Senior, Linux",0.339171
2126,JavaScript Developer,Currently we are looking for a JavaScript Deve...,JavaScript,0.278323
501,SAP BW Backend Architect,SAP BW Backend ArchitectOpen to travelers. ...,Architect/Developer role will be responsible f...,0.259592
4049,Contract Backend PHP/Wordpress Developer,"Minimum Required Skills:PHP, JavaScript, WordP...","PHP, JavaScript, WordPress, RESTful APIs, HTML...",0.256402
12960,Database Testing ( Backend Tester),Role : Database Testing ( Backend Tester)Locat...,Test,0.249762
8791,"Junior Developer - Hedge Fund - Python, AWS, J...",The CTO of a mid-sized hedge fund is seeking a...,"Software developer, Python, Javascript, AWS",0.242149
8263,"Django Developer - Python, API, Flask, Django","Minimum Required Skills:Python, MySQL, API, Ja...","Python, MySQL, API, JavaScript, Django - Pytho...",0.190459
15889,Java / Python Developer,"Java/Python DeveloperOverview:Since 1994, HMB ...","Python, Java, .NET, Ruby, SQL, Django, Python ...",0.188808
384,"Principal Software Engineer - JavaScript, Pyth...","Minimum Required Skills:JavaScript, Python, Ru...","JavaScript, Python, Ruby on Rails, AWS - JavaS...",0.177167


In [9]:
input_title = df_jobs.iloc[0]['title']
input_description  = df_jobs.iloc[0]['description']
input_skills = 'javascript, angular.js, react, node.js'

recommended_jobs = get_recommendations(input_title, input_description, input_skills)
recommended_jobs[['title', 'description', 'skills', 'score']]

sim_title: [1.         1.         1.         0.76508264 0.76061532 0.76061532
 0.76061532 0.76061532 0.76061532 0.76061532]
sim_description: [0.99819279 0.99819279 0.37223903 0.35448053 0.35216452 0.31422885
 0.31422885 0.31422885 0.30908141 0.29979894]
sim_skills: [1.         0.65477099 0.59857878 0.58591747 0.38641425 0.34828286
 0.34523591 0.34044998 0.32663496 0.32396231]


Unnamed: 0,title,description,skills,score
3295,NETWORK & SECURITY ENGINEER,Network & Security EngineerThe Walker Group pr...,5+ years' technical exp in multiple client env...,0.599639
14084,NETWORK & SECURITY ENGINEER,Network & Security EngineerThe Walker Group pr...,5+ years' technical exp in multiple client env...,0.599639
17975,Network Security Engineer,The Network Security Engineer is responsible f...,"Security, Network, Engineer, SANS, GSEC, GCIH,...",0.470433
40,Frontend/UI Developer,PLEASE JOIN OUR TALENT NETWORK: http://bit.ly/...,"Angular.JS, React, Node.js, JavaScript",0.414846
12181,Security Engineer,Our client is seeking a Security Engineer with...,"Proxy, Security, Network, encryption, tokeniza...",0.357225
12126,Network Engineer,"My client located in King of Prussia, PA is lo...","Cisco, Solarwinds, and Aruba",0.350451
11975,Security Engineer,Information Security Engineer Position Summary...,PCI-DSS/Security/ISO 27001/Cloud,0.349174
2010,Network Engineer,Job Title: Network EngineerLocation: Columbia ...,"Network Engineer, LAN, VLAN, VPN, Virtualizati...",0.345293
19661,Network Engineer,Our client is looking for a Network Engineer t...,"Firewalls, load balancing, routing, switching,...",0.344458
7665,Network Engineer,The speed of technology. The influx of data. S...,"Architecture, CCNP, Circuits, Cisco, DNS, Gene...",0.338867
