In [16]:
#Import necessary packages
import pandas as pd
from bs4 import BeautifulSoup
import requests as re

import time
import os
import nltk
from nltk.util import ngrams
from collections import Counter

In [17]:
#Read in data
data = pd.read_csv('starting_file.csv')
data.head()

Unnamed: 0,job_title,org_name,salary_min,salary_max,director_flag,analyst_flag,scientist_flag,manager_flag,engineer_flag,url
0,Director,Centers of Excellence,132552.0,183300.0,1.0,,,,,https://www.usajobs.gov/GetJob/ViewDetails/615...
1,Artificial Intelligence Specialist,Centers of Excellence,142701.0,170800.0,,,,,,https://drive.google.com/file/d/12Ik1jP78cQyY9...
2,Data and Analytics Specialist,Centers of Excellence,142701.0,170800.0,,1.0,,,,https://drive.google.com/file/d/1abhv-KfudFvIX...
3,Open Data Specialist,GSA,109366.0,142180.0,,,,,,https://drive.google.com/file/d/1PwKFGcJ6hMKVr...
4,Technology Portfolio Director,TTS Operations,106595.0,138572.0,1.0,,,,,https://drive.google.com/file/d/1bP0kbXeTeKEIs...


In [5]:
#Evaluate duplicate titles/org names
#Not currently used, but should use later
with open(f'./jd-data/dups.txt','w') as f:    
    f.write(str(data[['job_title','org_name','url']].groupby(['job_title','org_name']).count()))

In [98]:
#Pull html files from data file
for i in range(0,data.shape[0]):
    num = i
    filename = f'{data.org_name[num]} - {data.job_title[num]}'
    if not os.path.exists(f'./jd-data/{filename}.txt'):
        r = re.get(data.url[num])
        with open(f'./jd-data/{filename}.txt','w') as f:
            f.write(r.text)
        time.sleep(3)

In [113]:
#Extract text from html files
for i in range(0,data.shape[0]):
    num = i
    filename = f'{data.org_name[num]} - {data.job_title[num]}'
    if os.path.exists(f'./jd-data/{filename}.txt'):
        with open(f'./jd-data/{filename}.txt','r') as f:
            soup = BeautifulSoup(f, 'html.parser')
        with open(f'./jd-soup/soup - {filename}.txt','w') as f:
            f.write(soup.get_text().strip())


In [114]:
#Normalize data in job postings and find bigrams

token_list = []
bigram_list = []

for i in range(0,data.shape[0]):
    num = i
    filename = f'{data.org_name[num]} - {data.job_title[num]}'
    with open(f'./jd-soup/soup - {filename}.txt','r') as f:
        test = f.read()

    tokens = nltk.word_tokenize(test.replace('\n',' ').strip())
    from nltk.corpus import stopwords
    stemmer = nltk.stem.PorterStemmer()

    stop_words = set(stopwords.words('english'))
    manual_stop = {'usajobs','announcement','job'
                ,'unit','keyword','search','official'
                ,'open','application','submit','button','must'}
    stop_words = stop_words.union(manual_stop)

    remove_stopwords = [x for x in tokens if not x.lower() in stop_words]
    remove_punct=[word.lower() for word in remove_stopwords if word.isalpha()]

    test_stems = [stemmer.stem(x) for x in remove_punct]

    token_list.append(test_stems)

    bigram_list.append(list(ngrams(test_stems, 2)))
    
    with open(f'./jd-tokens/tokens - {filename}.txt','w') as f:
        for line in test_stems:
            f.write(f"{line}\n")

    with open(f'./jd-bigrams/bigrams - {filename}.txt','w') as f:
        for line in Counter(ngrams(test_stems, 2)):
            f.write(f"{line}\n")

In [115]:
#Add tokens and bigrams to dataframe
data['tokens'] = token_list
data['bigrams'] = bigram_list
data.head()

Unnamed: 0,job_title,org_name,salary_min,salary_max,director_flag,analyst_flag,scientist_flag,manager_flag,engineer_flag,url,tokens,bigrams
0,Director,Centers of Excellence,132552.0,183300.0,1.0,,,,,https://www.usajobs.gov/GetJob/ViewDetails/615...,"[websit, unit, state, govern, help, keyword, l...","[(websit, unit), (unit, state), (state, govern..."
1,Artificial Intelligence Specialist,Centers of Excellence,142701.0,170800.0,,,,,,https://drive.google.com/file/d/12Ik1jP78cQyY9...,"[titl, center, excel, artifici, intellig, spec...","[(titl, center), (center, excel), (excel, arti..."
2,Data and Analytics Specialist,Centers of Excellence,142701.0,170800.0,,1.0,,,,https://drive.google.com/file/d/1abhv-KfudFvIX...,"[titl, center, excel, data, analyt, specialist...","[(titl, center), (center, excel), (excel, data..."
3,Open Data Specialist,GSA,109366.0,142180.0,,,,,,https://drive.google.com/file/d/1PwKFGcJ6hMKVr...,"[googl, drivesign]","[(googl, drivesign)]"
4,Technology Portfolio Director,TTS Operations,106595.0,138572.0,1.0,,,,,https://drive.google.com/file/d/1bP0kbXeTeKEIs...,"[googl, drivesign]","[(googl, drivesign)]"


In [116]:
#If you want to subset out the job you're comparing to, can do here
director_ds = data[data.job_title=='Director of Data Science'].reset_index()
sample_jobs = data[data.job_title!='Director of Data Science'].reset_index()
all_jobs = data.copy()
director_ds

Unnamed: 0,index,job_title,org_name,salary_min,salary_max,director_flag,analyst_flag,scientist_flag,manager_flag,engineer_flag,url,tokens,bigrams
0,27,Director of Data Science,State of Colorado,86916.0,99960.0,1.0,,,,,https://www.governmentjobs.com/careers/colorad...,"[bulletin, state, colorado, director, data, sc...","[(bulletin, state), (state, colorado), (colora..."


In [117]:
#Extract bigrams of interest
bigram_skill_search = {
    "data_science":('data', 'scienc'),
    "data_engineering":('data', 'engin'),
    "database_design":('databas', 'design'),
    "data_analysis":('data', 'analysi'),
    "collaborate_stakeholders":('collabor', 'stakehold'),
    "data_quality":('data', 'qualiti'),
    "strong_communication":('strong', 'commun'),
    "communication_skills":('commun', 'skill'),
    "data_pipelines":('data', 'pipelin'),
    "unstructured_data":('unstructur', 'data'),
    "data_lake":('data', 'lake'),
    "data_model":('data', 'model'),
    "data_scientist":('data', 'scientist'),
    "data_requirements":('data', 'requir'),
    "team_management":('team', 'manag'),
    "data_manipulation":('data', 'manipul'),
    "data_warehouse":('data', 'wareh'),
    "machine_learning":('machin', 'learn')
}

bigram_requirements = {
    "bachelor_degree":('bachelor', 'degre'),
    "masters_degree":('master', 'doctor')
}

monogram_skill_search = ['sql','python','tableau','r']

In [118]:
#Create bool fields where bigram is detected
for i in bigram_skill_search:
    temp_list = []
    for j in range(0,all_jobs.shape[0]):
        if bigram_skill_search[i] in all_jobs.bigrams[j]:
            temp_list.append(1)
        else:
            temp_list.append(0)
    all_jobs[i] = temp_list

for i in monogram_skill_search:
    temp_list = []
    for j in range(0,all_jobs.shape[0]):
        if i in all_jobs.tokens[j]:
            temp_list.append(1)
        else:
            temp_list.append(0)
    all_jobs[i] = temp_list

all_jobs.head(10)

Unnamed: 0,job_title,org_name,salary_min,salary_max,director_flag,analyst_flag,scientist_flag,manager_flag,engineer_flag,url,...,data_scientist,data_requirements,team_management,data_manipulation,data_warehouse,machine_learning,sql,python,tableau,r
0,Director,Centers of Excellence,132552.0,183300.0,1.0,,,,,https://www.usajobs.gov/GetJob/ViewDetails/615...,...,0,0,0,0,0,0,0,0,0,0
1,Artificial Intelligence Specialist,Centers of Excellence,142701.0,170800.0,,,,,,https://drive.google.com/file/d/12Ik1jP78cQyY9...,...,0,0,0,0,0,1,0,0,0,1
2,Data and Analytics Specialist,Centers of Excellence,142701.0,170800.0,,1.0,,,,https://drive.google.com/file/d/1abhv-KfudFvIX...,...,0,0,0,0,1,1,1,1,1,1
3,Open Data Specialist,GSA,109366.0,142180.0,,,,,,https://drive.google.com/file/d/1PwKFGcJ6hMKVr...,...,0,0,0,0,0,0,0,0,0,0
4,Technology Portfolio Director,TTS Operations,106595.0,138572.0,1.0,,,,,https://drive.google.com/file/d/1bP0kbXeTeKEIs...,...,0,0,0,0,0,0,0,0,0,0
5,Analyst,State of California,,,,1.0,,,,https://docs.google.com/document/d/1KizKFfpnk6...,...,0,0,0,0,0,0,0,0,0,0
6,Business Intelligence Analyst,State of California,,,,1.0,,,,https://docs.google.com/document/d/16sfsX3Ss6B...,...,1,0,0,1,0,0,1,1,1,1
7,Business Solutions Analyst,State of California,,,,1.0,,,,https://docs.google.com/document/d/1nbO1eBIhy1...,...,0,0,0,0,0,0,0,0,0,0
8,Data Engineer,State of California,,,,,,,1.0,https://docs.google.com/document/d/1Nn9I8RiVNx...,...,1,0,0,1,0,0,1,1,0,0
9,Data Analyst,State of California,,,,1.0,,,,https://docs.google.com/document/d/1HehRy29Sk_...,...,1,0,0,0,0,0,1,0,0,0


In [119]:
#Create bool fields to categorize job level type
import numpy as np
all_jobs['JobType'] = np.where(
     all_jobs['director_flag']==1,
    'Director', 
     np.where(all_jobs['analyst_flag']==1, 'Data Analyst', 
            np.where(all_jobs['scientist_flag']==1, 'Data Scientist', 
               np.where(all_jobs['manager_flag']==1, 'Manager',
                  np.where(all_jobs['engineer_flag']==1, 'Data Engineer', 
                     'Unknown'
                  )
               )
            )
      )
)

In [120]:
#Output to csv
all_jobs[['job_title', 'org_name', 'salary_min', 'salary_max',
       'director_flag', 'analyst_flag', 'scientist_flag', 'manager_flag',
       'engineer_flag', 'url', 'data_science',
       'data_engineering', 'database_design', 'data_analysis',
       'collaborate_stakeholders', 'data_quality', 'strong_communication',
       'communication_skills', 'data_pipelines', 'unstructured_data',
       'data_lake', 'data_model', 'data_scientist', 'data_requirements',
       'team_management', 'data_manipulation', 'data_warehouse',
       'machine_learning', 'sql', 'python', 'tableau','JobType']].to_csv('simplified_output.csv')

In [121]:
#Output to csv
all_jobs.to_csv('full_output.csv')

In [122]:
#Cluser together bigram skill scoring
perc_similar = all_jobs[['job_title', 'org_name', 'salary_min', 'salary_max',
       'director_flag', 'analyst_flag', 'scientist_flag', 'manager_flag',
       'engineer_flag', 'data_science',
       'data_engineering', 'database_design', 'data_analysis',
       'collaborate_stakeholders', 'data_quality', 'strong_communication',
       'communication_skills', 'data_pipelines', 'unstructured_data',
       'data_lake', 'data_model', 'data_scientist', 'data_requirements',
       'team_management', 'data_manipulation', 'data_warehouse',
       'machine_learning', 'sql', 'python', 'tableau','r']].copy()

In [123]:
#Calculate how many bigram skills should be available
num_skills = len(bigram_skill_search)+len(monogram_skill_search)

In [124]:
#Generate similarity vectors based on number of bigram values found
perc_similar['similarity_count']=perc_similar.iloc[:, -num_skills-1:].sum(axis=1)
perc_similar['similarity_perc']=perc_similar['similarity_count']/num_skills

In [125]:
#Print out top matching jobs
perc_similar[['job_title','similarity_perc']].sort_values(by='similarity_perc',ascending=False).head(10)

Unnamed: 0,job_title,similarity_perc
27,Director of Data Science,1.0
36,Data Engineering Specialist,0.590909
35,Senior Data Engineer,0.590909
59,Data Scientist,0.5
93,Data Scientist,0.5
78,SENIOR DATA SCIENTIST,0.5
105,Data Scientist,0.454545
10,Data Scientist,0.454545
14,Senior Data Scientist,0.454545
37,Bloomberg Data Engineer,0.409091


In [127]:
#Print out jobs with no matching values
perc_similar[['job_title','org_name','similarity_perc']][perc_similar.similarity_perc == 0].shape

(35, 3)