In [1]:
# Steps of the job crawler component

'''
Input: Job title keywords, location keywords, more to add in the future

Output: A list of job postings feeds, format to be determined

clean:
    remove duplicates (a job may be posted on multiple sites)
    remove unrelated jobs (feed may contain jobs that are not related to the keywords, we need a classifier)
    put the feed into a suitable format (json, with fields like title, company, location, description, url, etc.)

store:
    store the feed in a txt file in preliminary version

At the end of the program, it should consolidate all the feeds, store them in JSON format, report the number of jobs found,
and save the feeds in a txt file

'''

'\nInput: Job title keywords, location keywords, more to add in the future\n\nOutput: A list of job postings feeds, format to be determined\n\nclean:\n    remove duplicates (a job may be posted on multiple sites)\n    remove unrelated jobs (feed may contain jobs that are not related to the keywords, we need a classifier)\n    put the feed into a suitable format (json, with fields like title, company, location, description, url, etc.)\n\nstore:\n    store the feed in a txt file in preliminary version\n\nAt end of the program, it should consolidate all the feeds and store them in jason format, report with number of jobs found, \nand save the feeds in a txt file\n\n'

In [2]:
### Import libraries
import pandas as pd

In [3]:
# Path to the Kaggle job-postings CSV, relative to this notebook's working directory
job_dataset_path = '../../../Data/job_listing_kaggle_dataset/job_postings.csv'
# Load the whole CSV into a DataFrame (default integer index, no dtype overrides)
job_dataset = pd.read_csv(job_dataset_path)
# Preview the first rows as plain text
print(job_dataset.head())

       job_id  company_id                                              title  \
0  3757940104    553718.0                              Hearing Care Provider   
1  3757940025   2192142.0  Shipping & Receiving Associate 2nd shift (Beav...   
2  3757938019    474443.0                               Manager, Engineering   
3  3757938018  18213359.0                                               Cook   
4  3757937095    437225.0        Principal Cloud Security Architect (Remote)   

                                         description  max_salary  med_salary  \
0  Overview\n\nHearingLife is a national hearing ...         NaN     5250.00   
1  Metalcraft of Mayville\nMetalcraft of Mayville...         NaN         NaN   
2  \nThe TSUBAKI name is synonymous with excellen...         NaN         NaN   
3  descriptionTitle\n\n Looking for a great oppor...         NaN       22.27   
4  Job Summary\nAt iHerb, we are on a mission to ...    275834.0         NaN   

   min_salary pay_period formatted_wor

In [4]:
# List every column name available in the dataset
print(job_dataset.columns)

Index(['job_id', 'company_id', 'title', 'description', 'max_salary',
       'med_salary', 'min_salary', 'pay_period', 'formatted_work_type',
       'location', 'applies', 'original_listed_time', 'remote_allowed',
       'views', 'job_posting_url', 'application_url', 'application_type',
       'expiry', 'closed_time', 'formatted_experience_level', 'skills_desc',
       'listed_time', 'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'scraped'],
      dtype='object')


In [5]:
# Show the 'title' value stored under index label 0
# (the first row, given the default integer index produced by read_csv)
print(job_dataset['title'][0])

Hearing Care Provider


In [6]:
# Show the full 'description' text stored under index label 0
print(job_dataset['description'][0])

Overview

HearingLife is a national hearing care company and part of the Demant Group, a global leader in hearing healthcare built on a heritage of care, health, and innovation since 1904. HearingLife operates more than 600 hearing care centers across 42 states. We follow a scientific, results-oriented approach to hearing healthcare that is provided by highly skilled and caring professionals. Our vision is to help more people hear better through life-changing hearing health delivered by the best personalized care. This Team Member must uphold the HearingLife Core Values:

 We create trust  We are team players  We apply a can-do attitude  We create innovative solutions 

Responsibilities

You will help more people hear better by providing clinical expertise to diagnose and treat hearing loss while ensuring a positive patient experience. The Hearing Care Provider acts in accordance with required industry and state professional licensing standards and local practice scope and is responsib

In [7]:
# Show every field of the posting at position 100 (zero-based, i.e. the 101st row)
print(job_dataset.iloc[100])

job_id                                                               3757930799
company_id                                                           78708714.0
title                                                        Imaging Supervisor
description                   Imaging Supervisor- Radiology Clinical Support...
max_salary                                                             190000.0
med_salary                                                                  NaN
min_salary                                                             100000.0
pay_period                                                               YEARLY
formatted_work_type                                                   Full-time
location                                                             Orange, CA
applies                                                                     2.0
original_listed_time                                            1699080000000.0
remote_allowed                          

In [34]:
# important columns: title, description, location

In [8]:
# Normalise the free-text `location` column into a new `cleaned_location` column.

import re

# Accept values that START with "<letters/spaces>," (e.g. "Orange, CA") or with
# "United States" (either capitalisation). `match` only anchors at the beginning,
# so anything may follow the matched prefix.
# NOTE(review): city names containing punctuation (e.g. "St. Louis, MO") do not
# match and therefore fall back to 'United States' -- confirm this is intended.
pattern = re.compile(r"[a-zA-Z\s]*,[A-Z]*|[Uu]nited [Ss]tates")


def _clean_location(location, default='United States'):
    """Return `location` when it matches `pattern`, otherwise `default`.

    Non-string values (such as the float NaN pandas uses for a missing cell)
    also map to `default` instead of raising TypeError inside `pattern.match`.
    """
    if isinstance(location, str) and pattern.match(location):
        return location
    return default


# A single column-wise assignment replaces the row-by-row `.loc` writes; it is
# faster and no longer assumes the index labels are exactly 0..n-1.
job_dataset['cleaned_location'] = job_dataset['location'].map(_clean_location)

In [9]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from collections import defaultdict

# Create a mapping from each exact title string to the list of job IDs posted
# under that title.
# The two columns are walked in lockstep instead of doing a label lookup per
# row: this is faster and does not assume the index labels are 0..n-1.
# `.tolist()` yields plain Python scalars, so the stored IDs are built-in ints
# that the `json` module can serialise directly.
title_to_job_ids = defaultdict(list)
for title, job_id in zip(job_dataset['title'].tolist(), job_dataset['job_id'].tolist()):
    title_to_job_ids[title].append(job_id)

# Load the pre-trained sentence-embedding model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Distinct job titles in first-seen order.
# `unique()` keeps a stable order from run to run, whereas `list(set(...))`
# reorders strings in every new kernel session and so changes how ties are ranked.
# Missing titles (NaN) are dropped because they are not text that can be embedded.
unique_titles = job_dataset['title'].dropna().unique().tolist()

# Generate one embedding per unique title; row i corresponds to unique_titles[i]
title_embeddings = model.encode(unique_titles, convert_to_tensor=True)

def find_most_similar_titles(query_title, title_embeddings, unique_titles, title_to_job_ids, top_n=20):
    """Rank the known job titles by cosine similarity to `query_title`.

    Relies on the module-level `model` (used to encode the query) and `util`
    (used for the cosine-similarity computation).

    Args:
        query_title: Free-text title to search for.
        title_embeddings: Embeddings of `unique_titles`, in the same order.
        unique_titles: Sequence of titles that `title_embeddings` was built from.
        title_to_job_ids: Mapping from a title to the job IDs posted under it.
        top_n: Maximum number of matches to return. Values larger than
            `len(unique_titles)` are clamped; values <= 0 yield an empty list.

    Returns:
        A list of `(title, job_ids, similarity)` tuples, best match first,
        where `similarity` is a Python float.
    """
    # Clamp the request so asking for more matches than there are titles
    # returns everything instead of raising an out-of-bounds error.
    k = min(top_n, len(unique_titles))
    if k <= 0:
        return []

    # Encode the query title
    query_embedding = model.encode(query_title, convert_to_tensor=True)

    # Compute cosine similarities (first row: the single query vs. every title)
    cosine_scores = util.pytorch_cos_sim(query_embedding, title_embeddings)[0]

    # Bring the scores into host memory before handing them to NumPy; NumPy
    # cannot read a tensor that lives on an accelerator device.
    if hasattr(cosine_scores, "detach"):
        scores = cosine_scores.detach().cpu().numpy()
    else:
        scores = np.asarray(cosine_scores)

    # Highest scores first; a stable sort keeps tied titles in input order.
    top_indices = np.argsort(-scores, kind="stable")[:k]

    # Retrieve the corresponding titles and their job IDs
    similar_titles_and_ids = []
    for index in top_indices:
        matched_title = unique_titles[index]
        similar_titles_and_ids.append(
            (matched_title, title_to_job_ids[matched_title], float(scores[index]))
        )

    return similar_titles_and_ids

# Example usage: rank the known titles against a sample query
# (top_n is not passed, so the function's default of 20 applies)
query_title = "Senior Sales Manager"
similar_titles_and_ids = find_most_similar_titles(query_title, title_embeddings, unique_titles, title_to_job_ids)

# Print one line per match: the title, every job ID posted under it, and its similarity score
for title, job_ids, score in similar_titles_and_ids:
    print(f"Title: {title}, Job IDs: {job_ids}, Similarity: {score}")


Title: Senior Sales Manager, Job IDs: [3757495433, 3756148719, 3699062061, 3693050755, 3693047061], Similarity: 1.0
Title: Sales Operations Senior Manager, Job IDs: [3749346422], Similarity: 0.9269579648971558
Title: Senior National Sales Manager, Job IDs: [3755597256, 3693596707], Similarity: 0.9164255857467651
Title: Senior Sales Executive, Job IDs: [3757407170], Similarity: 0.9043762683868408
Title:  Senior Sales Executive, Job IDs: [3757455012], Similarity: 0.9043762683868408
Title: Sales Manager, Job IDs: [3757935001, 3757934264, 3757934178, 3757934003, 3757933435, 3757933252, 3757932799, 3757932736, 3757932471, 3757932015, 3757931774, 3757931759, 3757931738, 3757931730, 3757931721, 3757931720, 3757931640, 3757931639, 3757931596, 3757930887, 3757930873, 3757930215, 3757930108, 3757929959, 3757929921, 3757929797, 3757929630, 3757928987, 3757928981, 3757928980, 3757928954, 3757928953, 3757928939, 3757927968, 3757927949, 3757927940, 3757927855, 3757927838, 3757927797, 3757924400, 375