In [1]:
#Credit to Edward Ross - https://skeptric.com/notebooks/Parsing%20Experience%20from%20Adzuna%20Job%20Ads.html
import re
import pandas as pd
import spacy
from spacy.util import filter_spans
from spacy.tokens import Span
from spacy.matcher import Matcher

In [2]:
from spacy import displacy
from IPython.display import HTML, display


In [3]:
import en_core_web_sm
nlp = en_core_web_sm.load()

In [4]:
df = pd.read_csv('outputs/data_science_jobs.csv')

In [5]:
df = df.drop(df.columns[0], axis = 1) 

In [6]:
import regex as re
df['Job Description'] = df['Job Description'].apply(lambda x: re.sub(r'[!;,\s]', ' ', x))
df.sample(50, random_state = 5)

Unnamed: 0,Job Title,Company Name,Job Description,Location,Job Board
852,HEAD OF ANALYTICS & INSIGHTS/DATA SCIENTIST,Saje Natural Wellness,For over 25 years we’ve been helping our commu...,Vancouver,Indeed
207,Machine Learning Engineer,Aviva,About the job Individually we are people but ...,Markham,Indeed
627,"Data Engineer / Power BI Developer, Omnia AI",Deloitte\n,Job Type: Permanent Primary Location: Vancouve...,Vancouver,Glassdoor
671,Junior Business Intelligence Developer - Data ...,CAPRiser Group,Our subsidiary is in PropTech and real estate ...,Toronto,Indeed
661,"Data Scientist Technical Lead (Montreal), Inta...",Intact\n,Who needs insurance? Everybody. That keeps us ...,Montreal,Glassdoor
489,Bio informaticien/Bioinformatics scientist,GenAIz\n,Avant le SRAS-CoV-2 un vaccin qui était dével...,Montreal,Glassdoor
434,Data Scientist,Awakedata,RESPONSIBILITIES: Develop state-of-the-art com...,Burnaby,Glassdoor
448,Scientifique de données / Data Scientist,BusPatrol\n,Qui sommes-nous : BusPatrouille est une entrep...,Montreal,Glassdoor
634,Data Management Specialist - Immunotoxicology,Charles River Laboratories\n,For 70 years Charles River employees have wor...,Senneville,Glassdoor
748,Data Scientist,F8th,We're transforming the Cyber Security industry...,Midtown Toronto,Glassdoor


In [7]:
ads = list(df['Job Description'])

In [8]:
len(ads)

881

In [9]:
def highlight_terms(terms, texts):
    for doc in nlp.pipe(texts):
        for sentence in set([tok.sent for tok in doc if tok.lower_ in terms]):
            text = sentence.text.strip()
            markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>', text)
            display(HTML(markup))
            print('-----')

In [10]:
highlight_terms(['experience'], ads[:10])

-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


-----


In [11]:
matcher = Matcher(nlp.vocab)
pattern = [{'POS': 'NOUN', 'OP': '+'}, {'LOWER': 'experience'}]
matcher.add('experience_noun', [pattern])

pattern = [{'LOWER': 'experience'}, {'POS': 'ADP'}, {'POS': {'IN': ('DET', 'NOUN', 'PROPN')}, 'OP': '+'}]
matcher.add('experience_adp', [pattern])

In [12]:
doc = nlp(ads[0])
matcher(doc)

[(12285600890577657150, 117, 119),
 (1417798585642285709, 239, 242),
 (1417798585642285709, 239, 243),
 (12285600890577657150, 265, 267)]

In [13]:
def show_extraction(examples, *extractors):
    seen = set()
    for doc in nlp.pipe(examples):
        doc.ents = filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)])
        for tok in doc:
            if tok.lower_ == 'experience':
                sentence = tok.sent
                if sentence.text in seen:
                    continue
                seen.update([sentence.text])
                if not sentence.ents:
                    doc.ents = list(doc.ents) + [Span(doc, tok.i, tok.i+1, 'MISSING')]
                displacy.render(sentence, style='ent', options = {'colors': {'MISSING': 'pink',
                                                                            'EXPERIENCE': 'lightgreen'}})
                

In [14]:
show_extraction(ads[:5], matcher)


In [15]:
def get_extractions(examples, *extractors):
    # Could use context instead of enumerate
    for idx, doc in enumerate(nlp.pipe(examples, batch_size=100, disable=['ner'])):
        for ent in filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)]):
            sent = ent.root.sent
            yield ent.text, idx, ent.start, ent.end, ent.label_, sent.start, sent.end

In [16]:
list(get_extractions(ads[:3], matcher))


[('user experience', 0, 117, 119, 'experience_noun', 92, 121),
 ('experience in the application', 0, 239, 243, 'experience_adp', 224, 251),
 ('techniques Experience', 0, 265, 267, 'experience_noun', 251, 304),
 ('data science experience', 1, 143, 146, 'experience_noun', 136, 169),
 ('software development experience Experience',
  1,
  157,
  161,
  'experience_noun',
  136,
  169),
 ('Experience with time', 1, 169, 172, 'experience_adp', 169, 178),
 ('environment Experience', 1, 197, 199, 'experience_noun', 178, 212),
 ('fitness experience', 2, 73, 75, 'experience_noun', 70, 76),
 ('experience in ETL', 2, 399, 402, 'experience_adp', 396, 413),
 ('Experience with Informatica IICS', 2, 445, 449, 'experience_adp', 413, 481),
 ('Experience with message queues', 2, 455, 459, 'experience_adp', 413, 481),
 ('experience in a DevOps environment Experience',
  2,
  468,
  474,
  'experience_adp',
  413,
  481)]

In [17]:
def extract_df(*extractors, n_max=None, **kwargs):
    if n_max is None:
        n_max = len(df)
    ent_df = pd.DataFrame(list(get_extractions(df[:n_max]['Job Description'], *extractors)),
                          columns=['text', 'docidx', 'start', 'end', 'label', 'sent_start', 'sent_end'])
    return ent_df.merge(df, how='left', left_on='docidx', right_index=True)

In [18]:
%time 
ent_df = extract_df(matcher, n_max=1000)
ent_df.head()

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 4.29 µs


Unnamed: 0,text,docidx,start,end,label,sent_start,sent_end,Job Title,Company Name,Job Description,Location,Job Board
0,user experience,0,117,119,experience_noun,92,121,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
1,experience in the application,0,239,243,experience_adp,224,251,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
2,techniques Experience,0,265,267,experience_noun,251,304,Data Scientist,Metricsflow Inc,At Metricsflow we work to shape the future of...,St. John's,Glassdoor
3,data science experience,1,143,146,experience_noun,136,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed
4,software development experience Experience,1,157,161,experience_noun,136,169,Data Scientist - 05/12/21,Acerta Analytics Solutions Inc,Acerta’s machine learning platforms leverage a...,Kitchener,Indeed


In [19]:
def aggregate_df(df, col=['text']):
    return (df
            .groupby(col)
            .agg(n_company=('Company Name', 'nunique'))
            .reset_index()
            .sort_values(['n_company'], ascending=False)
        )

In [20]:
aggregate_df(ent_df).head(10)


Unnamed: 0,text,n_company
1369,work experience,49
1383,years experience,45
1203,industry experience,34
619,customer experience,29
793,experience in a,27
1359,user experience,22
870,experience in data science,20
329,Experience with Python,19
1066,experience with Python,15
931,experience in the,15


In [21]:
def showent(docidx, start, end, label, sent_start, sent_end, **kwargs):
    # We don't need to parse it, so just make_doc
    doc = nlp.make_doc(ads[docidx])
    doc.ents = [Span(doc, start, end, label)]
    sent = doc[sent_start:sent_end]
    displacy.render(sent, style='ent')
    
def showent_df(df):
    for idx, row in df.iterrows():
        showent(**row)

In [22]:
showent_df(ent_df.query('text == "experience in a"').head())


In [23]:
def extract_noun_phrase_experience(doc):
    for np in doc.noun_chunks:
        if np[-1].lower_ == 'experience':
            if len(np) > 1:
                yield 'EXPERIENCE', np[0].i, np[-1].i

In [24]:
show_extraction(ads[:5], extract_noun_phrase_experience)


In [25]:
%time
ent_df = extract_df(extract_noun_phrase_experience, n_max=50000)


CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs


In [26]:
aggregate_df(ent_df).head(50)


Unnamed: 0,text,n_company
897,hands-on,29
925,industry,28
853,equivalent,25
287,Hands-on,23
1083,relevant,22
253,Extensive,18
1273,years,17
425,Proven,17
875,extensive,16
1265,work,16


In [27]:
experience_qualifiers = ['previous', 'prior', 'following', 'recent', 'the above', 'past',
                         
                         'proven', 'demonstrable', 'demonstrated', 'relevant', 'significant', 'practical',
                         'essential', 'equivalent', 'desirable', 'required', 'considerable', 'similar',
                         'working', 'specific', 'qualified', 'direct', 'hands on', 'handson', 'hands-on'
                         
                         'strong', 'solid', 'good', 'substantial', 'excellent', 'the right', 'valuable', 'invaluable',
                         
                         'some', 'any', 'none', 'much', 'extensive', 'no', 'more',
                         'your', 'their', 'great',
                         'years', 'months',
                        ]

stopwords = ['a', 'an', '*', '**', '.', 'this', 'the', ':', 'Skills']

experience_qualifier_pattern = rf'\b(?:{"|".join(experience_qualifiers)})\b'

experience_qualifier_pattern

'\\b(?:previous|prior|following|recent|the above|past|proven|demonstrable|demonstrated|relevant|significant|practical|essential|equivalent|desirable|required|considerable|similar|working|specific|qualified|direct|hands on|handson|hands-onstrong|solid|good|substantial|excellent|the right|valuable|invaluable|some|any|none|much|extensive|no|more|your|their|great|years|months)\\b'

In [28]:
aggregate_df(ent_df[(~ent_df.text.str.lower().str.contains(experience_qualifier_pattern)) & # Not a qualifier
                     ~ent_df.text.isin(stopwords)]).head(50)

Unnamed: 0,text,n_company
665,hands-on,29
690,industry,28
191,Hands-on,23
946,work,16
791,professional,15
365,Strong,15
573,customer,13
13,),12
289,Programming,11
953,·,11


In [29]:
def extract_adp_experience(doc, label='EXPERIENCE'):
    for tok in doc:
        if tok.lower_ == 'experience':
            for child in tok.rights:
                if child.dep_ == 'prep':
                    for obj in child.children:
                        if obj.dep_ == 'pobj':
                            yield label, obj.left_edge.i, obj.i+1

In [30]:
show_extraction(ads[10:15], extract_adp_experience)

In [31]:
def extract_adp_experience_2(doc):
    for np in doc.noun_chunks:
        start_tok = np[0].i
        if start_tok >= 2 and doc[start_tok - 2].lower_ == 'experience' and doc[start_tok - 1].pos_ == 'ADP':
            yield 'EXPERIENCE', start_tok, start_tok + len(np)


In [32]:
show_extraction(ads[10:15], extract_adp_experience_2)

In [33]:
%time 
ent_adp_df = extract_df(extract_adp_experience, n_max=50)


CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.1 µs


In [34]:
%time 
ent_adp_df = extract_df(extract_adp_experience_2, n_max=50)


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs


In [None]:
%time 
ent_adp_df = extract_df(extract_adp_experience, n_max=50000)


CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


In [None]:
aggregate_df(ent_adp_df).head(50)


In [None]:
showent_df(ent_adp_df.query("text=='C'").head(5))


In [None]:
showent_df(ent_adp_df.query("text=='R'").head(5))


In [None]:
def highlight_text_context(terms, texts, n_before=1, n_after=2):
    context = []
    for doc in nlp.pipe(texts):
        sentences = list(doc.sents)
        idxs = [i for i, sent in enumerate(sentences) if any(term in sent.text.lower() for term in terms)]
        
        for idx in idxs:
            before = ''.join(sent.text for sent in sentences[max(idx-n_before, 0):idx])
            after = ''.join(sent.text for sent in sentences[idx+1:min(idx+n_before+1, len(sentences))])
            text = sentences[idx].text
            markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>',
                                 f'<span style="color:blue">{text}</span>')
            display(HTML(before + markup + after))

In [None]:
terms = ['experience']

for _, q in ent_adp_df.query("text=='a'").head(7).iterrows():
    doc = nlp(q.FullDescription)
    if q.sent_start > 0:
        prev_sent = doc[q.sent_start - 1].sent.text
    else:
        prev_sent = ''
    
    if q.sent_end < len(doc):
        next_sent = doc[q.sent_end].sent.text
    else:
        next_sent = ''
        
    text = doc[q.sent_start:q.sent_end].text
    markup = re.sub(fr'(?i)\b({"|".join(terms)})\b', r'<strong>\1</strong>',
                     f'<span style="color:blue">{text}</span>')
    display(HTML(prev_sent + markup + next_sent))

In [None]:
def get_left_span(tok, label='', include=True):
    offset = 1 if include else 0
    idx = tok.i
    while idx > tok.left_edge.i:
        if tok.doc[idx - 1].pos_ in ('NOUN', 'PROPN', 'ADJ', 'X'):
            idx -= 1
        else:
            break
    return label, idx, tok.i+offset


In [None]:
def get_conjugations(tok):
    new = [tok]
    while new:
        tok = new.pop()
        yield tok
        for child in tok.children:
            if child.dep_ == 'conj':
                new.append(child)

In [None]:
#old
EXP_TERMS = ['experience']
def extract_adp_conj_experience(doc, label='EXPERIENCE'):
    for tok in doc:
        if tok.lower_ in EXP_TERMS:
            for child in tok.rights:
                if child.dep_ == 'prep':
                    for obj in child.children:
                        if obj.dep_ == 'pobj':
                            for conj in get_conjugations(obj):
                                yield get_left_span(conj, label)

In [None]:
show_extraction(ads[10:15], extract_adp_conj_experience)


In [None]:
show_extraction(ads[:10], extract_adp_conj_experience)


In [None]:
def extract_verb_maybeadj_noun_experience(doc, label='EXPERIENCE'):
    for tok in doc:
        if tok.lower_ in EXP_TERMS:
            for child in tok.rights:
                if child.dep_ == 'acl':
                    for gc in child.children:
                        if gc.dep_ == 'prep':
                            for ggc in gc.children:
                                if ggc.dep_ == 'pobj':
                                    for c in get_conjugations(ggc):
                                        yield get_left_span(c, 'EXPERIENCE')
                        elif gc.dep_ == 'dobj':
                            for c in get_conjugations(gc):
                                yield get_left_span(c, 'EXPERIENCE')


In [None]:
show_extraction(ads[5:10], extract_verb_maybeadj_noun_experience)


In [None]:
extract_exps = [extract_adp_conj_experience,]


In [None]:
n_ads = len(df)

In [None]:
%%time
df_ents = extract_df(*extract_exps, n_max=n_ads)

In [None]:
df_ents

In [None]:
showent_df(df_ents[:2])


In [None]:
df_ent_agg = aggregate_df(df_ents)
df_ent_agg.head(10)

In [None]:
from flashtext import KeywordProcessor


In [None]:
keyword_processor = KeywordProcessor(case_sensitive=True)


In [None]:
#selecting most popular
skills = df_ent_agg.query('n_company >= 3').text
len(skills)

In [None]:
for skill in skills:
    keyword_processor.add_keyword(skill)

In [None]:
from collections import Counter


In [None]:
skills = list(
(df_ent_agg
 .query('n_company >= 3')
).text
)
len(skills)

In [None]:
n_max=1000
for a,b,c in zip(skills[:n_max:3],skills[1:n_max:3],skills[2:n_max:3]):
     print('{:<35}{:<35}{:<}'.format(a,b,c))

In [None]:
def get_extractions_2(examples, *extractors):
    # Could use context instead of enumerate
    doc = nlp(examples, disable=['ner'])
    for ent in filter_spans([Span(doc, start, end, label) for extractor in extractors for label, start, end in extractor(doc)]):
        sent = ent.root.sent
        yield ent.text



In [None]:
def list_skills(examples, *extractors):
    return list(get_extractions_2(examples, *extractors))

In [None]:
print(list_skills(df['Job Description'][10], extract_adp_conj_experience))

In [None]:
#for debugging visually
print(show_extraction(df['Job Description'][10]), extract_adp_conj_experience)

In [None]:
%%time
#create a new column in df with the fxn that works
df['skills'] = df['Job Description'].apply(lambda x: list_skills(x, extract_adp_conj_experience)) 


In [None]:
df.head()

In [None]:
df.sample(50, random_state = 5)

In [None]:
with open('outputs/raw_skills.txt', 'w') as f:
    for skill in skills:
        print(skill, file=f)

In [None]:
df.to_csv('outputs/rule_based_extracted_skills.csv', index=False)
