In [2]:
import pandas as pd
import nltk
nltk.download('stopwords')
import re

import spacy
nlp = spacy.load('en_core_web_sm')

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelle/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


use cases: we want to have some automatic custom tagging of job descriptions
to do this, we can mine some keywords and group similar jobs together

In [3]:
df = pd.read_json("ww_data_2920_jobs_w_wtr.json")

In [4]:
df = df.drop("division")
df = df.drop("graphs")
df = df.drop("hire_history")
df = df.drop("organization")

In [6]:
df = df.dropna(axis=1)

In [7]:
df_final = pd.DataFrame(columns = ['id','job_title','job_summary','job_responsibilities','required_skills','duration','special_job_reqs','app_docs'])

for x in df.columns:
    id = x
    try:
        title = df[x][3]['Job Title']
    except:
        title = ""
    try: 
        summary = df[x][3]['Job Summary']
    except:
        summary = ""
    try:
        responsibilities = df[x][3]['Job Responsibilities']
    except:
        responsibilities = ""
    try:
        requirements = df[x][3]['Required Skills']
    except:
        requirements = ""
    try:
        duration = df[x][3]['Work Term Duration']
    except:
        duration = ""
    try:
        special_job_requirements = df[x][3]['Special Job Requirements']
    except:
        special_job_requirements = ""
    app_docs = df[x][0]['Application Documents Required']
    
    df_final = df_final.append({'id':id,'job_title':title,'job_summary':summary,
    'job_responsibilities':responsibilities,'required_skills':requirements,
    'duration':duration, 'special_job_reqs':special_job_requirements,'app_docs':app_docs},ignore_index=True)
    df_final = df_final.drop_duplicates()
    df_final['text'] = df_final['job_responsibilities'] + df_final['required_skills']

In [14]:
def get_reqs(text):
    reqs = []
    if (('University of Waterloo Co-op Work History') in text):
        reqs.append('work history')
    if('Cover Letter' in text):
        reqs.append('cover letter')
    if('Résumé' in text):
        reqs.append('resume')
    if('Grade Report' in text):
        reqs.append("grade report")
    if('Other' in text):
        reqs.append('other - per job posting')
    return reqs

def get_duration(text):
    if ('4 month' in text):
        return '4 month'
    if ('8 month' in text):
        if('preferred' in text):
            return '8 month preferred'
        if('required' in text):
            return '8 month required'
    if ('2 work term' in text):
        if('preferred' in text):
            return '2 work terms preferred'
        if('required' in text):
            return '2 work terms required'
    return text

def get_special_reqs(text):
    list = []
    if('directly to the employer' in text):
        list.append("external application")
    if ('Canadian citizen' in text):
        list.append("Canadian citizenship")
    if('permanent resident' in text):
        list.append("permanent resident")
    if('fully vaccinated' in text):
        list.append("fully vaccinated")
    if('eligible to work in the USA' in text or 'USA visa' in text):
        list.append("USA work eligibility")
    if('This job requires you to work remotely from CANADA' in text):
        list.append("remote requirement Canada")
    if('Security Clearance' in text):
        list.append('security clearance')
    
    return list

In [15]:
df_final['app_docs_tags'] = df_final.apply(lambda row: get_reqs(row['app_docs']),axis=1)
df_final['duration_tags'] = df_final.apply(lambda row: get_duration(row['duration']),axis=1)
df_final['special_reqs_tags'] = df_final.apply(lambda row: get_special_reqs(row['special_job_reqs']),axis=1)

In [17]:
clean_html = re.compile('<.*?>') 
def sanitize(text):
    #normalize and remove html
    text = re.sub(clean_html, '', text)
    text = text.lower()
    text = re.sub("&nbsp;","",text)
    text = re.sub("&amp;","",text)
    text = re.sub("\\n","",text)
    text = re.sub("\\'","\'",text)
    text = re.sub("\\t","",text)
    text=re.sub("    ","",text)
    #text = re.sub(" ?","",text)
    #remove unicode
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    #remove misc
    text = re.sub(" job description  summary ","",text)
    return text

In [18]:
df_final['sanitized'] = df_final.apply(lambda row: sanitize(row['text']),axis=1)

In [19]:
def no_number_preprocessor(tokens):
    r = re.sub('(\d)+', 'NUM', tokens.lower())
    # This alternative just removes numbers:
    # r = re.sub('(\d)+', '', tokens.lower())
    return r

In [49]:
sw_custom = ['exceptional','working','hours','classes','wellness','national','pride','interns','new','outstanding','april',
'customers','drivers','work','office',
'workday','month','internship','job','seeabove','co','op','co-op','li',
'clients', 'examples', 'questions', 'experience', 'postsecondary', 'position', 'group','services',
'priorities', 'requirement', 'specialists', 'others', 'member','role', 'preference', 'commitment', 'ability', 
'stakeholders','skills', 'people', 'challenges', 'opportunity', 'interest','company','waterlooworks','companies',
'colleagues','applicants','students','interested applicants','interested','student','january','companies','company','advisors']

In [50]:
n_gram_range = (1, 2)
stop_words = stopwords.words('english')
stop_words.extend(sw_custom)

model = SentenceTransformer('distilbert-base-nli-mean-tokens')

counter = 0

def get_keywords(text,n):
    try:
        count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words,preprocessor = no_number_preprocessor).fit([text])
        all_candidates = count.get_feature_names()
    except:
        return []
    doc = nlp(text)
    noun_phrases = set(chunk.text.strip().lower() for chunk in doc.noun_chunks)

    nouns = set()
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.add(token.text)
    all_nouns = nouns.union(noun_phrases)
    candidates = list(filter(lambda candidate: candidate in all_nouns, all_candidates))

    doc_embedding = model.encode([text])
    candidate_embeddings = model.encode(candidates)

    top_n = n
    try:
        distances = cosine_similarity(doc_embedding, candidate_embeddings)
    except:
        return []
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    return keywords


In [51]:
df_final['keywords'] = df_final.apply(lambda row: get_keywords(row['text'],5),axis=1)

In [42]:
df_tags = df_final.drop(columns=['job_title','job_summary','job_responsibilities','required_skills','duration','special_job_reqs','app_docs','text','sanitized'])

In [43]:
df_tags_id = df_tags.set_index('id')

In [44]:
df_tags_id

Unnamed: 0_level_0,app_docs_tags,duration_tags,special_reqs_tags,keywords
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
272204,"[work history, cover letter, resume, grade rep...",2 work terms preferred,[fully vaccinated],"[optics, python programming, mathematica, quan..."
278145,"[work history, cover letter, resume, grade rep...",4 month,[],"[environmental conditions, sheet metal, manual..."
278340,"[work history, cover letter, resume, grade rep...",8 month required,[],"[facilitation, technical expertise, technical ..."
278501,"[work history, resume, grade report]",4 month,[],"[facebook, instagram, fundraising, business in..."
278832,"[work history, resume, grade report]",4 month,"[fully vaccinated, USA work eligibility]","[team, redesign, engineers, week, startup]"
...,...,...,...,...
295823,"[work history, cover letter, resume, grade rep...",4 month,[],"[waste, shipping, ontario regulation, accounti..."
295824,"[work history, resume, grade report]",4 month,[],"[proficiency, forecasts, accounting, good visi..."
295825,"[work history, cover letter, resume, grade rep...",8 month required,[],"[working knowledge, workflows, mechanical desi..."
295829,"[work history, resume, grade report]",8 month required,[external application],"[innovative thinking, program performance, vac..."


In [112]:
from transformers import AutoModel, AutoTokenizer
model_name = "distilroberta-base"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

def get_keywords_2(text,n):
    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words,preprocessor = no_number_preprocessor).fit([text])
    all_candidates = count.get_feature_names()
    doc = nlp(text)
    noun_phrases = set(chunk.text.strip().lower() for chunk in doc.noun_chunks)

    nouns = set()
    for token in doc:
        if token.pos_ == "NOUN":
            nouns.add(token.text)
    all_nouns = nouns.union(noun_phrases)
    candidates = list(filter(lambda candidate: candidate in all_nouns, all_candidates))
    candidate_tokens = tokenizer(candidates, padding=True, truncation=True, max_length = 512,return_tensors="pt")
    candidate_embeddings = model(**candidate_tokens)["pooler_output"]
    text_tokens = tokenizer([text], padding=True, truncation=True, max_length = 512,return_tensors="pt")
    text_embedding = model(**text_tokens)["pooler_output"]

    candidate_embeddings = candidate_embeddings.detach().numpy()
    text_embedding = text_embedding.detach().numpy()

    top_n = n
    distances = cosine_similarity(text_embedding, candidate_embeddings)
    keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    return keywords

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [108]:
df_final['keywords_2'] = df_final.apply(lambda row: get_keywords_2(row['text'],5),axis=1)

In [110]:
df_final.to_csv("df_final_keywords_5.csv",index=False)

In [53]:
df_final['keywords_2'][15]

['user', 'experiences', 'fitness classes', 'creed', 'software']

In [109]:
df_final

Unnamed: 0,id,job_title,job_summary,job_responsibilities,required_skills,duration,text,sanitized,keywords_2,keywords
0,280880,January 2023 Tax MySolutions- 4 Month Co-op (C...,<strong>Interested applicants must apply throu...,<strong>What you'll create and do</strong> \n<...,<strong>What you'll bring to this role:</stron...,4 month work term,<strong>What you'll create and do</strong> \n<...,what youll create and do pwcs tax professional...,"[application, pwc, business issues, profession...","[accounting, financial reporting, business iss..."
1,280881,January 2023 Tax MySolutions- 4 Month Co-op (C...,<strong>Interested applicants must apply throu...,<strong>What you'll create and do</strong> \n<...,<strong>What you'll bring to this role:</stron...,4 month work term,<strong>What you'll create and do</strong> \n<...,what youll create and do pwcs tax professional...,"[application, pwc, business issues, profession...","[accounting, financial reporting, business iss..."
2,280886,January 2023 Tax MySolutions- 8 Month Co-op (C...,<strong>Interested applicants must apply throu...,<strong>What you'll create and do</strong> \n<...,<strong>What you'll bring to this role:</stron...,8 month consecutive work term required,<strong>What you'll create and do</strong> \n<...,what youll create and do pwcs tax professional...,"[application, pwc, business issues, profession...","[accounting, financial reporting, business iss..."
3,280889,January 2023 Tax MySolutions- 8 Month Co-op (C...,<strong>Interested applicants must apply throu...,<strong>What you'll create and do</strong> \n<...,<strong>What you'll bring to this role:</stron...,8 month consecutive work term required,<strong>What you'll create and do</strong> \n<...,what youll create and do pwcs tax professional...,"[application, pwc, business issues, profession...","[accounting, financial reporting, business iss..."
4,280890,January 2023 Tax MySolutions- 8 Month Co-op (C...,<strong>Interested applicants must apply throu...,<strong>What you'll create and do</strong> \n<...,<strong>What you'll bring to this role:</stron...,8 month consecutive work term required,<strong>What you'll create and do</strong> \n<...,what youll create and do pwcs tax professional...,"[application, pwc, business issues, profession...","[accounting, financial reporting, business iss..."
...,...,...,...,...,...,...,...,...,...,...
428,292221,Junior Policy Analyst,"Are you interested in technology, cutting-edge...","<a name=""_Hlk92288386""></a> \n<br>\nDuties may...","<a name=""_Hlk92288398""><strong>Essential skill...",4 month work term,"<a name=""_Hlk92288386""></a> \n<br>\nDuties may...",duties may include writing briefing notes and...,"[analytics, employment equity, application, mi...","[data science, employment equity, briefing not..."
429,292251,Junior Accountant,WinnerMax Capital Inc. is a Canadian instituti...,What will you do \n<br>\n&nbsp; \n<br>\nWorkin...,What you need to succeed \n<br>\n&nbsp; \n<br>...,8 month consecutive work term preferred,What will you do \n<br>\n&nbsp; \n<br>\nWorkin...,what will you do working closely with our acc...,"[vender payments, claimswhat, records, banking...","[business, journal entry, banking, accounting,..."
430,292409,Experiential Marketing + Event & Sampling Coor...,<strong>Who we are:&nbsp;</strong> \n<br>\nAt ...,<strong>Experiential Marketing + Event &amp; S...,<strong>You have:&nbsp;</strong>\n\n \n<ul>\n ...,4 month work term,<strong>Experiential Marketing + Event &amp; S...,experiential marketing event sampling coordi...,"[activities, coordinators, content, demo coord...","[demo coordinators, google, brand awareness, s..."
431,292454,"Software Engineering: Kernel, Systems, Memory,...",MemVerge is seeking software developers who de...,A Few of Your Key Responsibilities could inclu...,<strong>Software Engineer - Systems Software</...,8 month consecutive work term required,A Few of Your Key Responsibilities could inclu...,a few of your key responsibilities could inclu...,"[file systems, related unit, virtualization, s...","[engineering, performance tuning, virtualizati..."
