In [8]:
from keybert import KeyBERT
import psycopg2 
import os
from dotenv import load_dotenv
import pandas as pd

# Call load_dotenv() first, then read the database connection settings from
# the environment. Each value is a string, or None when the variable is unset.
load_dotenv()

host, db, user, password, port = (
    os.getenv(env_key)
    for env_key in ("DB_HOST", "DB_NAME", "DB_USER", "DB_PASSWORD", "DB_PORT")
)

In [2]:
# Open the database connection. Only driver errors are caught (a bare
# `except` would also swallow KeyboardInterrupt/SystemExit), and the error is
# re-raised: the query below cannot run without a connection, and on a re-run
# a stale `cursor` left over from an earlier execution would otherwise be
# silently reused.
try:
    conn = psycopg2.connect(dbname=db, user=user, password=password, host=host, port=port)
except psycopg2.Error:
    print("Failed to connect to database. Please try again.")
    raise

cursor = conn.cursor()

# Fetch up to 100 rows whose search term is 'Beauty'.
cursor.execute('''SELECT x.* FROM public.jobs x
                WHERE searchterm = 'Beauty'
                limit 100''')
records = cursor.fetchall()

In [3]:
# Position of the description value inside each fetched row.
# NOTE(review): relies on the column order returned by `SELECT x.*` — confirm
# against the table schema, or select the column by name instead.
DESCRIPTION_COL = 13

# Join every description into one long string. Missing/empty descriptions are
# skipped so that a None value cannot break ' '.join().
descriptions = [row[DESCRIPTION_COL] for row in records if row[DESCRIPTION_COL]]
descStr = ' '.join(descriptions)

kw_model = KeyBERT()

# Single extraction pass: the earlier default-argument call was discarded
# without being used, so it only added model run time.
results = kw_model.extract_keywords(descStr, keyphrase_ngram_range=(1,1), top_n=10, stop_words=None)
print(results)

# `results` is a list of (keyword, score) pairs; keep just the keywords.
# Unlike list(zip(*results))[0], this also works when `results` is empty.
keywords = tuple(keyword for keyword, _score in results)
print(keywords)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

[('salons', 0.5187), ('salon', 0.5185), ('barbershop', 0.4082), ('cosmetologists', 0.3439), ('barbering', 0.3434), ('commissions', 0.3332), ('paraprofessionals', 0.3303), ('scheduling', 0.3303), ('401k', 0.3147), ('payroll', 0.3138)]
('salons', 'salon', 'barbershop', 'cosmetologists', 'barbering', 'commissions', 'paraprofessionals', 'scheduling', '401k', 'payroll')


In [5]:
def create_df(cursor_list):
    """Build a DataFrame with named columns from positional job rows.

    Parameters
    ----------
    cursor_list : iterable of sequences
        Rows such as those returned by ``cursor.fetchall()``. Values are taken
        by position: element 0 becomes ``id``, element 1 ``vendorid``, and so
        on up to element 14 (``indeedlink``). Elements past index 14 are
        ignored. A one-shot iterable (e.g. a generator) is accepted.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with the 15 columns in the order listed in
        ``column_names`` below.

    Raises
    ------
    IndexError
        If any row has fewer than 15 elements.
    """
    # Column names in the same order as the positional values of each row.
    column_names = (
        'id', 'vendorid', 'positionname', 'company', 'location',
        'searchterm', 'searcharea', 'scrapedat', 'createdat', 'postedat',
        'salary', 'benefits', 'requirements', 'description', 'indeedlink',
    )

    # Materialize once: the rows are walked once per column below.
    rows = list(cursor_list)

    return pd.DataFrame(
        data={
            name: [row[position] for row in rows]
            for position, name in enumerate(column_names)
        }
    )

In [6]:
# Fetch every row of the jobs table (no filter, no limit) through the open cursor.
cursor.execute('''SELECT * FROM jobs;''')
all_job_data = cursor.fetchall()

In [9]:
# Turn the fetched rows into a DataFrame with named columns.
df = create_df(all_job_data)

In [10]:
# Preview the first few rows of the full jobs DataFrame.
df.head()

Unnamed: 0,id,vendorid,positionname,company,location,searchterm,searcharea,scrapedat,createdat,postedat,salary,benefits,requirements,description,indeedlink
0,1201,4a6c4eb734bec325,Route Delivery Driver,Powerstride Battery,"Hayward, CA 94545",Driving,"San Jose, California",2023-02-04 03:17:55.334,2023-02-04 03:18:49.045522,,,,,Delivery Route Driver We are seeking a deliver...,https://www.indeed.com/company/Powerstride-Bat...
1,1202,6a8683806384c1d2,Flatbed Drivers Wanted,"Smokey Point Distributing, Inc.",California,Driving,"San Jose, California",2023-02-04 03:17:56.070,2023-02-04 03:18:49.113831,,$0.14 - $0.20 per mile,,,\n Job Description:\n \n Smokey Point Distrib...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
2,1203,4bc5451d5ce13ced,Local Delivery Driver (non-CDL)- 7am Shift,HD Supply,"San Jose, CA 95131",Driving,"San Jose, California",2023-02-04 03:17:56.442,2023-02-04 03:18:49.188570,,$25.00 - $27.50 an hour,,,\n\n \n Job Summary\n \n \n Make deliveries...,https://www.indeed.com/rc/clk?jk=4bc5451d5ce13...
3,1204,feb060c87393b5f4,OTR - Conestoga Class - A Driver!,Keep Trucking LLC,California,Driving,"San Jose, California",2023-02-04 03:17:56.457,2023-02-04 03:18:49.268109,,"$1,800 - $2,300 a week",,,\n Job Description:\n \n Earn One Of The High...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...
4,1205,e0202ef963f325da,Delivery Driver - Full-Time,NAPA Auto Parts,"Santa Clara, CA 95054",Driving,"San Jose, California",2023-02-04 03:17:56.658,2023-02-04 03:18:49.334937,,$19.02 an hour,,,\n \n Job Description\n \n As a Full -Time D...,https://www.indeed.com/rc/clk?jk=e0202ef963f32...


In [15]:
# Keep only the postings whose search term is exactly 'Technology', then preview them.
technology = df[df['searchterm'] == 'Technology']
technology.head()

Unnamed: 0,id,vendorid,positionname,company,location,searchterm,searcharea,scrapedat,createdat,postedat,salary,benefits,requirements,description,indeedlink
261,10717,43ae324f23164e03,IT Project Manager,"R1 RCM, Inc.","Chicago, IL",Technology,"Chicago, Illinois",2023-02-05 00:21:21.506,2023-02-05 00:23:01.855739,,,,,\n R1 is a leading provider of technology-enab...,https://www.indeed.com/rc/clk?jk=43ae324f23164...
291,10792,d987595072e85b9c,Data Engineer,Northwestern Medicine,"Chicago, IL 60611",Technology,"Chicago, Illinois",2023-02-05 00:22:08.811,2023-02-05 00:23:07.355277,,,,,"\n Benefits\n \n $10,000 Tuition Reimburseme...",https://www.indeed.com/rc/clk?jk=d987595072e85...
3935,6430,5eb740753a34659a,Software Quality Assurance Tester,PRECISIONxtract,"Indianapolis, IN",Technology,"Indianapolis, Indiana",2023-02-04 20:37:57.325,2023-02-04 20:39:02.195272,,,,,\n The Software Quality Assurance Tester role ...,https://www.indeed.com/rc/clk?jk=5eb740753a346...
4133,6431,e9ac3acc074f4201,Software Engineer,Indiana University,"Indianapolis, IN 46202",Technology,"Indianapolis, Indiana",2023-02-04 20:37:59.742,2023-02-04 20:39:02.264564,,,,,\n \n \n Department\n \n \n \n ...,https://www.indeed.com/rc/clk?jk=e9ac3acc074f4...
5062,6308,c61746423b2d680c,Certified ServiceNow Solution Developer - Secu...,Deloitte,"Indianapolis, IN 46204",Technology,"Indianapolis, Indiana",2023-02-04 20:36:23.927,2023-02-04 20:38:53.566755,,,,,\n ServiceNow Solution Developer \n \n Are y...,https://www.indeed.com/rc/clk?jk=c61746423b2d6...


In [18]:
# Show the description text of the first Technology posting.
technology['description'].iloc[0]

'\n R1 is a leading provider of technology-enabled revenue cycle management services which transform and solve challenges across health systems, hospitals and physician practices. Headquartered in Chicago, R1 is publicly traded organization with employees throughout the US and international locations. \n Our mission is to be the one trusted partner to manage revenue, so providers and patients can focus on what matters most. Our priority is to always do what is best for our clients, patient’s and each other. With our proven and scalable operating model, we complement a healthcare organization’s infrastructure, quickly driving sustainable improvements to net patient revenue and cash flows while reducing operating costs and enhancing the patient experience. \n \n The IT Project Manager will be responsible for providing strategic and tactical project and program management oversight. This role will support IT, senior leaders, product owners, key stake holders and various other internal tea

In [4]:
import nltk
from collections import Counter

# Download the NLTK 'stopwords' package (read later via nltk.corpus.stopwords).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [19]:
# Lower-case the first Technology description and split it on whitespace.
# Punctuation stays attached to the tokens (e.g. 'systems,').
tokens = technology['description'].iloc[0].lower().split()

In [20]:
# Display the whitespace-split tokens of the sample description.
tokens

['r1',
 'is',
 'a',
 'leading',
 'provider',
 'of',
 'technology-enabled',
 'revenue',
 'cycle',
 'management',
 'services',
 'which',
 'transform',
 'and',
 'solve',
 'challenges',
 'across',
 'health',
 'systems,',
 'hospitals',
 'and',
 'physician',
 'practices.',
 'headquartered',
 'in',
 'chicago,',
 'r1',
 'is',
 'publicly',
 'traded',
 'organization',
 'with',
 'employees',
 'throughout',
 'the',
 'us',
 'and',
 'international',
 'locations.',
 'our',
 'mission',
 'is',
 'to',
 'be',
 'the',
 'one',
 'trusted',
 'partner',
 'to',
 'manage',
 'revenue,',
 'so',
 'providers',
 'and',
 'patients',
 'can',
 'focus',
 'on',
 'what',
 'matters',
 'most.',
 'our',
 'priority',
 'is',
 'to',
 'always',
 'do',
 'what',
 'is',
 'best',
 'for',
 'our',
 'clients,',
 'patient’s',
 'and',
 'each',
 'other.',
 'with',
 'our',
 'proven',
 'and',
 'scalable',
 'operating',
 'model,',
 'we',
 'complement',
 'a',
 'healthcare',
 'organization’s',
 'infrastructure,',
 'quickly',
 'driving',
 'sust

In [118]:
from nltk.corpus import stopwords
from collections import Counter

def remove_stopwords(tokens: list, stop_words=None):
    """Return the tokens that are not stop words, preserving order.

    Parameters
    ----------
    tokens : list
        Tokens to filter. Matching is exact, so tokens should already be
        normalized the same way as the stop words.
    stop_words : iterable of str, optional
        Words to drop. Defaults to ``stopwords.words('english')``.

    Returns
    -------
    list
        A new list holding the tokens that are not in the stop-word set.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')

    # Build the lookup once. Previously the stop-word list was re-read and
    # scanned linearly for every single token.
    stop_set = frozenset(stop_words)
    return [w for w in tokens if w not in stop_set]

# Drop stop words from the sample description's tokens.
filtered = remove_stopwords(tokens)

In [34]:
# Download the NLTK 'wordnet' and 'omw-1.4' packages.
# Presumably both are needed by WordNetLemmatizer — TODO confirm omw-1.4 is still required.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\allen\AppData\Roaming\nltk_data...


True

In [41]:
from nltk.stem import WordNetLemmatizer

def lemmatize_tokens(tokens, lemmatizer):
    """Return a new list with ``lemmatizer.lemmatize`` applied to each token, in order."""
    return [lemmatizer.lemmatize(word) for word in tokens]

# Lemmatize the stop-word-filtered tokens of the sample description.
lemmatizer = WordNetLemmatizer()
lemmatized = lemmatize_tokens(filtered, lemmatizer)

In [37]:
# Count how often each lemma occurs and print the 100 most common.
lemma_count = Counter(lemmatized)
print(lemma_count.most_common(100))

[('project', 17), ('team', 10), ('management', 9), ('work', 9), ('business', 8), ('various', 6), ('experience', 5), ('manage', 4), ('patient', 4), ('manager', 4), ('role', 4), ('offshore', 4), ('status', 4), ('r1', 3), ('service', 3), ('healthcare', 3), ('program', 3), ('support', 3), ('senior', 3), ('product', 3), ('key', 3), ('provide', 3), ('communication', 3), ('working', 3), ('including', 3), ('&', 3), ('disability', 3), ('provider', 2), ('revenue', 2), ('across', 2), ('u', 2), ('partner', 2), ('focus', 2), ('priority', 2), ('proven', 2), ('operating', 2), ('providing', 2), ('it,', 2), ('also', 2), ('onshore', 2), ('define', 2), ('solutions.', 2), ('success', 2), ('plan', 2), ('monitor', 2), ('cross-functional', 2), ('information', 2), ('security,', 2), ('agile', 2), ('environment', 2), ('collaboration', 2), ('result', 2), ('help', 2), ('make', 2), ('collaborative', 2), ('communicate', 2), ('unit', 2), ('successful', 2), ('completion.', 2), ('task', 2), ('training,', 2), ('develop

In [117]:
def create_huge_list_of_cleaned_tokens(df):
    """Collect the cleaned tokens of every description into one flat list.

    Each description is lower-cased, split on whitespace, filtered with
    ``remove_stopwords`` and lemmatized with ``lemmatize_tokens``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``description`` column.

    Returns
    -------
    list
        The cleaned tokens of all descriptions, concatenated in row order.
    """
    # One lemmatizer for the whole run instead of a new one per description.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for description in df['description']:
        # Skip missing values (None/NaN), which have no .lower().
        if not isinstance(description, str):
            continue
        tokens = description.lower().split()
        filtered = remove_stopwords(tokens)
        cleaned_tokens.extend(lemmatize_tokens(filtered, lemmatizer))

    return cleaned_tokens

In [None]:
# Build the cleaned token list across all Technology descriptions.
cleaned_tokens = create_huge_list_of_cleaned_tokens(technology)

In [None]:
# TF-IDF lets me input a search term and get scores showing which documents are most relevant to it.
# In this case the search terms are words from the job descriptions, and the documents are the job descriptions themselves.
# So this works in the opposite direction from what Lee was trying to do, which was to find the keywords in the job descriptions.

In [48]:
# Build an NLTK TextCollection from the Technology descriptions.
# The descriptions are passed as raw strings (not tokenized, not lower-cased).
tech_tc = nltk.TextCollection(technology['description'])

In [59]:
# TF-IDF score of the term 'Project' for the first Technology description.
tech_tc.tf_idf('Project', technology['description'].iloc[0])

0.0015289782093727126

In [89]:
# This job posting from Northeastern University has an empty string ('') as its description.
# Without the try/except in find_jobs_most_relevant_to_keywords, it breaks the scoring code below.
technology.iloc[1504]

id                                                          84041
vendorid                                         63ff8eb992fbda51
positionname                                                     
company                                   Northeastern University
location                                                         
searchterm                                             Technology
searcharea                                  Boston, Massachusetts
scrapedat                              2023-02-22 07:56:55.250000
createdat                              2023-02-22 07:58:38.565036
postedat                                                     None
salary                                                       None
benefits                                                     None
requirements                                                 None
description                                                      
indeedlink      https://www.indeed.com/rc/clk?jk=63ff8eb992fbd...
Name: 4540

In [110]:
# TextCollection provides tf, idf, and tf_idf abstractions so
# that we don't have to maintain/compute them ourselves
def find_jobs_most_relevant_to_keywords(QUERY_TERMS, top_n=5):
    """Print the Technology postings that score highest for the query terms.

    A posting's score is the sum of ``tech_tc.tf_idf(term, description)`` over
    the lower-cased query terms. Postings whose score is not greater than 0
    are dropped; the rest are sorted by descending score and the first
    ``top_n`` are printed.

    Reads the module-level ``technology`` DataFrame and ``tech_tc`` collection.

    Parameters
    ----------
    QUERY_TERMS : iterable of str
        Terms to score; each is lower-cased before scoring.
    top_n : int, default 5
        Number of top-scoring postings to print.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Lower-case the terms once instead of once per posting.
    # NOTE(review): the terms are lower-cased but the descriptions are scored
    # as stored, so a term may fail to match capitalized text ('Java' vs
    # 'java') — confirm whether the collection should be built lower-cased.
    terms = [t.lower() for t in QUERY_TERMS]

    descriptions = technology['description']
    position_names = technology['positionname']
    companies = technology['company']

    relevant_jobs = []
    for idx in range(len(descriptions)):
        description = descriptions.iloc[idx]
        score = 0
        for term in terms:
            try:
                score += tech_tc.tf_idf(term, description)
            except (ZeroDivisionError, TypeError, AttributeError):
                # An empty description is expected to raise ZeroDivisionError
                # (term frequency divides by the text length); TypeError and
                # AttributeError cover a missing (None) description. Other
                # errors, including KeyboardInterrupt, now propagate instead
                # of being swallowed by a bare `except`.
                print('Error. Index was {}, term was {}, description was{}. \n\n'.format(idx, term, description))
        if score > 0:
            relevant_jobs.append({'score': score,
                                  'index': idx,
                                  'position_name': position_names.iloc[idx],
                                  'company': companies.iloc[idx]})

    # Sort by score (highest first) and display the top results.
    relevant_jobs.sort(key=lambda p: p['score'], reverse=True)
    for job in relevant_jobs[:top_n]:
        print('Position Name: {0}'.format(job['position_name']))
        print('Index: {0}'.format(job['index']))
        print('Company: {}'.format(job['company']))
        print('Score: {0}'.format(job['score']))
        print()

In [111]:
# Print the top-scoring Technology postings for the terms 'Java' and 'Selenium' combined.
find_jobs_most_relevant_to_keywords(QUERY_TERMS = ['Java', 'Selenium'])

Error. Index was 1504, term was java, description was. 


Error. Index was 1504, term was selenium, description was. 


Position Name: Java Developer/Architect
Index: 3366
Company: Ghritachi Inc
Score: 0.009353780113539803

Position Name: Salesforce Developer (Only W2)
Index: 647
Company: Metasys Technologies
Score: 0.004859815704057458

Position Name: Full stack Java Developer
Index: 3107
Company: plaxonic
Score: 0.004195476079870275

Position Name: Full Stack Java Developer
Index: 3071
Company: Plaxonic Technologies Inc.
Score: 0.0039238265495189625

Position Name: Full Stack Java Developer
Index: 3127
Company: DBSI Services
Score: 0.003104358395350598



In [114]:
# Print the top-scoring Technology postings for the single term 'Python'.
find_jobs_most_relevant_to_keywords(QUERY_TERMS = ['Python'])

Error. Index was 1504, term was python, description was. 


Position Name: Automation Engineer
Index: 2094
Company: Kastech Software Solutions Group
Score: 0.011586319264647888

Position Name: QA Automation Engineer - Firmware
Index: 1720
Company: ASK Consulting
Score: 0.007275130701057977

Position Name: Technical Lead
Index: 1702
Company: Wipro Limited
Score: 0.0029793392394808858

Position Name: Azure Big data SRE
Index: 3467
Company: Emonics LLC
Score: 0.002914570995144345

Position Name: PySpark/Abinitio Developer
Index: 755
Company: Capgemini
Score: 0.002502644961163944



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer